def test_csv_file_header_always_the_same(self): """ Check that csv files have always the same order in their header. """ csv_outfile_1 = os.path.join(base, 'data/tmp/csvout1.csv') csv_outfile_2 = os.path.join(base, 'data/tmp/csvout2.csv') config = { 'keyword': 'some words', 'search_engines': all_search_engines, 'num_pages_for_keyword': 2, 'scrape_method': 'selenium', 'cachedir': os.path.join(base, 'data/csv_tests/'), 'do_caching': True, 'verbosity': 0, 'output_filename': csv_outfile_1, } search = scrape_with_config(config) search = scrape_with_config(config) config.update({'output_filename': csv_outfile_2}) search = scrape_with_config(config) assert os.path.isfile(csv_outfile_1) and os.path.isfile(csv_outfile_2) file1 = open(csv_outfile_1, 'rt') file2 = open(csv_outfile_2, 'rt') import csv reader1, reader2 = csv.DictReader(file1), csv.DictReader(file2) header1, header2 = reader1.fieldnames, reader2.fieldnames from GoogleScraper.output_converter import csv_fieldnames assert header1 == header2 == csv_fieldnames
def test_csv_output_static(self): """Test csv output. Test parsing 4 html pages with two queries and two pages per query and transforming the results to csv format. The cached file should be saved in 'data/csv_tests/', there should be as many files as search_engine * pages_for_keyword The keyword used in the static SERP pages MUST be 'some words' The filenames must be in the GoogleScraper cache format. """ import csv from GoogleScraper.output_converter import csv_fieldnames number_search_engines = len(all_search_engines) csv_outfile = os.path.join(base, 'data/tmp/csv_test.csv') config = { 'keyword': 'some words', 'search_engines': all_search_engines, 'num_pages_for_keyword': 2, 'scrape_method': 'selenium', 'cachedir': os.path.join(base, 'data/csv_tests/'), 'do_caching': True, 'verbosity': 0, 'output_filename': csv_outfile, } scrape_with_config(config) assert os.path.exists(csv_outfile), '{} does not exist'.format( csv_outfile) reader = csv.reader(open(csv_outfile, 'rt')) # the items that should always have a value: notnull = ('link', 'query', 'rank', 'domain', 'title', 'link_type', 'scrape_method', 'page_number', 'search_engine_name', 'snippet') for rownum, row in enumerate(reader): if rownum == 0: header = row header_keys = set(row) assert header_keys.issubset(set( csv_fieldnames)), 'Invalid CSV header: {}'.format(header) for item in notnull: assert row[header.index( item)], '{} has a item that has no value: {}'.format( item, row) self.assertAlmostEqual(number_search_engines * 2 * 10, rownum, delta=30)
def test_json_output_static(self): """Test json output. """ import json number_search_engines = len(all_search_engines) json_outfile = os.path.join(base, 'data/tmp/json_test.json') config = { 'keyword': 'some words', 'search_engines': all_search_engines, 'num_pages_for_keyword': 2, 'scrape_method': 'selenium', 'cachedir': os.path.join(base, 'data/json_tests/'), 'do_caching': True, 'verbosity': 0, 'output_filename': json_outfile } scrape_with_config(config) assert os.path.exists(json_outfile), '{} does not exist'.format( json_outfile) file = open(json_outfile, 'r') try: results = json.load(file) except ValueError as e: print('Cannot parse output json file {}. Reason: {}'.format( json_outfile, e)) raise e # the items that should always have a value: notnull = ('link', 'rank', 'domain', 'title', 'link_type') num_results = 0 for item in results: for k, v in item.items(): if k == 'results': for res in v: num_results += 1 for item in notnull: assert res[ item], '{} has a item that has no value: {}'.format( item, res) self.assertAlmostEqual(number_search_engines * 2 * 10, num_results, delta=30)
def insertNametoConfig_Search(name, config): #config['keyword'] = name + ' "Saas" rds.fightmetric.com' config['keyword'] = name + ' site:www.fightmetric.com' config['output_filename'] = name + ".json" print("PRINTING WITH PHRASE: " + config['keyword']) try: search = scrape_with_config(config) data = readinJsonSearch(name) print(str(data)) url_json = {} for res in data[0]['results']: #matches last name only...but search is for full name....so should be right 99.9%...? g = re.search(r'(.+)\s(.+)', name) n1 = str(g.group(2)) if n1 in res['title']: url_json = {'name': name, 'url': str(res['link'])} os.remove(name + '.json') with open(name + '.json', 'w') as outfile: json.dump(url_json, outfile) data = readinJsonSearch(name) #man, no fightmetric on "num_results": "0", ## "query": "Khalid Murtazaliev site:www.fightmetric.com", ## "requested_at": "2018-09-12 01:21:22.325246", except GoogleSearchError as e: print(e) try: if data["num_results"] != "0": print("returning none") return None except: return data
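# The snippet above calls a readinJsonSearch() helper that is not shown here.
# A minimal sketch of what it presumably does, assuming GoogleScraper wrote the
# search results for the fighter to '<name>.json' (the helper name and file
# layout are taken from the calling code, not from GoogleScraper itself):
import json

def readinJsonSearch(name):
    """Load the JSON output file previously written for this fighter's query."""
    with open(name + '.json', 'r') as infile:
        return json.load(infile)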
def test_no_results_serp_object(self):
    config = {
        'SCRAPING': {
            'keyword': 'asdfasdfa7654567654345654343sdfasd',
            'search_engines': '*',  # all available search engines
            'num_pages_for_keyword': 1,
            'scrape_method': 'selenium'
        },
        'GLOBAL': {
            'cachedir': 'data/no_results/',
            'do_caching': 'True',
            'verbosity': 1
        }
    }
    search = scrape_with_config(config)

    assert search.number_search_engines_used == len(all_search_engines)
    assert len(search.used_search_engines.split(',')) == len(all_search_engines)
    assert search.number_proxies_used == 1
    assert search.number_search_queries == 1
    assert search.started_searching < search.stopped_searching
    assert len(all_search_engines) == len(search.serps), 'Not enough results. Expected: {}, got {}'.format(
        len(all_search_engines), len(search.serps))

    for serp in search.serps:
        assert serp.has_no_results_for_query(), 'num_results must be 0 but is {}. {}'.format(
            serp.num_results, serp.links)

        # some search engines do alternative searches instead of yielding
        # nothing at all.
        if serp.search_engine_name in ('google', 'bing'):
            assert serp.effective_query, '{} must have an effective query when a keyword has no results.'.format(
                serp.search_engine_name)
def basic_usage(): # See in the config.cfg file for possible values config = { 'SCRAPING': { 'use_own_ip': 'True', 'keyword': 'Let\'s go bubbles!', 'search_engines': 'yandex', 'num_pages_for_keyword': 1 }, 'SELENIUM': { 'sel_browser': 'chrome', }, 'GLOBAL': { 'do_caching': 'False' } } try: sqlalchemy_session = scrape_with_config(config) except GoogleSearchError as e: print(e) # let's inspect what we got for search in sqlalchemy_session.query(ScraperSearch).all(): for serp in search.serps: print(serp) for link in serp.links: print(link)
def fetchImages(query): config = { 'keyword': query, 'search_engines': ['yandex'], 'search_type': 'image', 'scrape_method': 'selenium', 'do_caching': True, 'log_level': 'CRITICAL', 'print_results': 'summary', 'output_format': '' } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) image_urls = [] for serp in search.serps: image_urls.extend( [link.link for link in serp.links] ) max_num_of_images = 2 images = [] i = 0 for image_url in image_urls: images.append(unquote(image_url)) i += 1 if i > max_num_of_images: break return images
def test_no_results_serp_object(self): config = { "keyword": "asdfasdfa7654567654345654343sdfasd", "search_engines": all_search_engines, "num_pages_for_keyword": 1, "scrape_method": "selenium", "cachedir": os.path.join(base, "data/no_results/"), "do_caching": True, "verbosity": 1, } search = scrape_with_config(config) assert search.number_search_engines_used == len(all_search_engines) assert len(search.used_search_engines.split(",")) == len(search.used_search_engines.split(",")) assert search.number_proxies_used == 1 assert search.number_search_queries == 1 assert search.started_searching < search.stopped_searching assert len(all_search_engines) == len(search.serps), "Not enough results. Expected: {}, got {}".format( len(all_search_engines), len(search.serps) ) for serp in search.serps: assert serp.has_no_results_for_query(), "num_results must be 0 but is {}. {}".format( serp.num_results, serp.links ) # some search engine do alternative searches instead of yielding # nothing at all. if serp.search_engine_name in ("google", "bing"): assert serp.effective_query, "{} must have an effective query when a keyword has no results.".format( serp.search_engine_name )
def basic_usage(): # See in the config.cfg file for possible values generate_sub_queries(300) config = { 'SCRAPING': { 'use_own_ip': 'True', 'keyword_file': "queries.txt", 'search_engines': 'bing,baidu,yandex', 'num_workers': 8, 'num_pages_for_keyword': 20, 'scrape_method': 'http', }, 'SELENIUM': { 'sel_browser': 'chrome', 'num_workers': 4, }, 'GLOBAL': { 'do_caching': 'True' }, 'OUTPUT': { 'output_filename': 'out.txt', } } try: sqlalchemy_session = scrape_with_config(config) except GoogleSearchError as e: print(e)
def get_image_search_result(query):
    print('Searching image [{}]'.format(query))
    global config
    config['keyword'] = query
    config['search_type'] = 'image'
    config['search_engines'] = 'yandex'
    try:
        search = scrape_with_config(config)
        serp = search.serps[0]
        top_result = random.choice(serp.links)
        url = top_result.link
        url = urllib.parse.unquote(url)
        content = requests.get(url).content
    except Exception as e:
        print(e)
        content = 'Nothing found'
    try:
        with open('tmp.jpg', 'wb') as f:
            f.write(content)
        info = convert_image('tmp.jpg', 'tmp.jpg')
    except Exception:
        info = ''
    # return 'tmp.jpg'
    return '{} {}'.format(url, info)  # for qq temporarily
def analizar_keyword(keyword):
    # GoogleScraper configuration
    pdb.set_trace()
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keyword': keyword,
            'search_engines': 'bing',
            'num_pages_for_keyword': 3
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
        },
        'GLOBAL': {
            'do_caching': 'False'
        }
    }

    try:
        pdb.set_trace()
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    # Inspect what we got
    pdb.set_trace()
    for search in sqlalchemy_session.query(ScraperSearch).all():
        for serp in search.serps:
            print(serp)
            for link in serp.links:
                print(link)
def getTopResultsFromGoogle(word): keywords = [] keywords.append(word) config = { 'use_own_ip': 'True', 'keywords': keywords, 'search_engines': [ 'google', ], 'num_pages_for_keyword': 2, 'scrape_method': 'http', 'do_caching': 'False' } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) results = [] count = 0 if search.serps[0].page_number == 2: search.serps.reverse() for serp in search.serps: print(serp) for link in serp.links: if (count == 10): break results.append(link.link) count += 1 return results
def test_no_results_serp_object(self):
    config = {
        'keyword': 'asdfasdfa7654567654345654343sdfasd',
        'search_engines': all_search_engines,
        'num_pages_for_keyword': 1,
        'scrape_method': 'selenium',
        'cachedir': os.path.join(base, 'data/no_results/'),
        'do_caching': True,
        'verbosity': 1,
    }
    search = scrape_with_config(config)

    assert search.number_search_engines_used == len(all_search_engines)
    assert len(search.used_search_engines.split(',')) == len(all_search_engines)
    assert search.number_proxies_used == 1
    assert search.number_search_queries == 1
    assert search.started_searching < search.stopped_searching
    assert len(all_search_engines) == len(search.serps), 'Not enough results. Expected: {}, got {}'.format(
        len(all_search_engines), len(search.serps))

    for serp in search.serps:
        assert serp.has_no_results_for_query(), 'num_results must be 0 but is {}. {}'.format(
            serp.num_results, serp.links)

        # some search engines do alternative searches instead of yielding
        # nothing at all.
        if serp.search_engine_name in ('google', 'bing'):
            assert serp.effective_query, '{} must have an effective query when a keyword has no results.'.format(
                serp.search_engine_name)
def test_csv_output_static(self): """Test csv output. Test parsing 4 html pages with two queries and two pages per query and transforming the results to csv format. The cached file should be saved in 'data/csv_tests/', there should be as many files as search_engine * pages_for_keyword The keyword used in the static SERP pages MUST be 'some words' The filenames must be in the GoogleScraper cache format. """ import csv from GoogleScraper.output_converter import csv_fieldnames number_search_engines = len(all_search_engines) csv_outfile = os.path.join(base, "data/tmp/csv_test.csv") config = { "keyword": "some words", "search_engines": all_search_engines, "num_pages_for_keyword": 2, "scrape_method": "selenium", "cachedir": os.path.join(base, "data/csv_tests/"), "do_caching": True, "verbosity": 0, "output_filename": csv_outfile, } search = scrape_with_config(config) assert os.path.exists(csv_outfile), "{} does not exist".format(csv_outfile) reader = csv.reader(open(csv_outfile, "rt")) # the items that should always have a value: notnull = ( "link", "query", "rank", "domain", "title", "link_type", "scrape_method", "page_number", "search_engine_name", "snippet", ) for rownum, row in enumerate(reader): if rownum == 0: header = row header_keys = set(row) assert header_keys.issubset(set(csv_fieldnames)), "Invalid CSV header: {}".format(header) for item in notnull: assert row[header.index(item)], "{} has a item that has no value: {}".format(item, row) self.assertAlmostEqual(number_search_engines * 2 * 10, rownum, delta=30)
def saveLink(query):
    # See in the config.cfg file for possible values
    try:
        if query:
            file_name = query.replace(" ", "_")
            # `self` is not available in a module-level function; use a local config
            config = {
                'SCRAPING': {
                    'use_own_ip': 'True',
                    'keyword': query,
                    'search_engines': 'bing',
                    'num_pages_for_keyword': 1,
                    'scrape_method': 'http'
                },
                'SELENIUM': {
                    'sel_browser': 'chrome',
                },
                'OUTPUT': {
                    'output_filename': "path/" + file_name + ".json"
                },
                'GLOBAL': {
                    'do_caching': 'False'
                }
            }
            raw_html = ""
            sqlalchemy_session = scrape_with_config(config)
    except Exception:
        import traceback
        print(traceback.format_exc())
def ScrapLinksFromBrowser(self): # See in the config.cfg file for possible values global config config = { 'use_own_ip': True, 'keyword': 'security brigade', 'search_engines': ['Google', 'Bing', 'Yahoo', 'Yandex', 'Baidu', 'Duckduckgo'], 'num_pages_for_keyword': 2, 'scrape_method': 'selenium', 'sel_browser': 'chrome', } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) # let's inspect what we got for serp in search.serps: # print(serp) # print(serp.search_engine_name) # print(serp.scrape_method) # print(serp.page_number) # print(serp.requested_at) # print(serp.num_results) # ... more attributes ... for link in serp.links: self.listoflink.append(link)
def basic_usage(): # See in the config.cfg file for possible values generate_sub_queries(300) json_outfile = 'data/tmp/json_test.json' config = { 'SCRAPING': { 'use_own_ip': 'True', 'keyword_file': "queries.txt", 'search_engines': 'bing,baidu', 'num_pages_for_keyword': 10, 'scrape_method': 'http', 'num_workers': 8, }, 'GLOBAL': { 'cachedir': 'data/json_tests/', 'do_caching': 'True', 'verbosity': 0 }, 'OUTPUT': { 'output_filename': json_outfile } } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e)
def basic_search(query, engines, pages): # See in the config.cfg file for possible values config = { 'use_own_ip': True, 'keyword': query, 'search_engines': [engines], 'num_pages_for_keyword': pages, 'scrape_method': 'http', 'loglevel': 'WARN', 'print_results': 'summarize', 'do_caching': False } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) # let's inspect what we got ''' for serp in search.serps: #print(serp) print(serp.status) print(serp.scrape_method) print(serp.page_number) print(serp.requested_at) print(serp.num_results) for link in serp.links: print(link) ''' return search
def image_search(key_phrase, threads_count, pages_count, target_directory, search_engines=None): if not search_engines: search_engines = ['google', 'baidu', 'yandex', 'bing', 'yahoo'] config = { 'keywords': [key_phrase], 'search_engines': search_engines, 'search_type': 'image', 'scrape_method': 'selenium', 'do_caching': False, 'num_pages_for_keyword': str(pages_count) } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) image_urls = [] print("\t\t\t --- ", len(image_urls)) for serp in search.serps: image_urls.extend([link.link for link in serp.links]) print('[i] Going to scrape {num} images and saving them in "{dir}"'. format(num=len(image_urls), dir=target_directory)) try: os.mkdir(target_directory) except FileExistsError: pass # fire up 100 threads to get the images - threads_count threads = [ FetchResource(target_directory, []) for i in range(threads_count) ] while image_urls: for t in threads: try: t.urls.append(image_urls.pop()) except IndexError as e: break threads = [t for t in threads if t.urls] for t in threads: t.start() for t in threads: t.join() return True
def crawl_data(keyword):
    file_num = 0
    output_filename = './crawling_output/output_{}.csv'.format(file_num)
    params = {
        'keyword': keyword + ' site:www.quora.com',
        'num_pages': 2,
        'filename': output_filename,
    }
    config = get_config(**params)

    title_list = []
    title_origin_list = []
    similarity_list = []
    link_list = []
    dict_idx = 0
    output_dict = {}

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
    else:
        # helper that inspects the search results
        # test_google_search(search)

        # open csv file
        with open(output_filename, 'r', newline='') as csv_file:
            # csv_reader = csv.reader(csv_file, delimiter=',')
            csv_reader = csv.DictReader(csv_file, delimiter=',')
            for row in csv_reader:
                title_origin = row['title']
                title = row['title']
                link = row['link']

                # remove the subtitle from the title:
                # if there is a - or | as in 'title - src site', cut it off
                # (a hedged sketch of preprocess_title follows this function)
                title = preprocess_title(title)

                # build a dictionary element and add it
                dict_element = {
                    'title': title,
                    'title_origin': title_origin,
                    'similarity': 0.0,
                    'link': link,
                }
                output_dict[dict_idx] = dict_element
                title_list.append(title)
                title_origin_list.append(title_origin)
                link_list.append(row['link'])
                dict_idx += 1
                # if there is no separator, the sentence is kept as-is
            csv_file.close()

    return title_list, link_list
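# preprocess_title() is referenced above but not defined in the snippet. A hedged
# sketch based on the comments (strip a ' - source site' or ' | source site'
# suffix from a result title); the original helper's exact behaviour may differ:
import re

def preprocess_title(title):
    """Cut off a subtitle such as 'title - src site' or 'title | src site'."""
    # split on the first ' - ' or ' | ' and keep the left-hand part
    parts = re.split(r'\s[-|]\s', title, maxsplit=1)
    return parts[0].strip()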
def image_search():
    # See in the config.cfg file for possible values
    config = {
        'SCRAPING': {
            'keyword': 'snow nature',
            'search_engines': 'yandex,google,bing,duckduckgo,yahoo,baidu',
            'search_type': 'image',
            'scrapemethod': 'http'
        }
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    image_urls = []
    search = sqlalchemy_session.query(ScraperSearch).all()[-1]

    for serp in search.serps:
        image_urls.extend([link.link for link in serp.links])

    import threading, requests, os

    class FetchResource(threading.Thread):
        """Grabs a web resource and stores it in the target directory"""

        def __init__(self, target, urls):
            super().__init__()
            self.target = target
            self.urls = urls

        def run(self):
            for url in self.urls:
                with open(os.path.join(self.target, url.split('/')[-1]), 'wb') as f:
                    f.write(requests.get(url).content)

    # make a directory for the results
    os.mkdir('images')

    # fire up 100 threads to get the images
    num_threads = 100
    threads = [FetchResource('images/', []) for i in range(num_threads)]

    while image_urls:
        for t in threads:
            try:
                t.urls.append(image_urls.pop())
            except IndexError:
                # stop handing out URLs once the list is exhausted
                break

    threads = [t for t in threads if t.urls]

    for t in threads:
        t.start()

    for t in threads:
        t.join()  # threading.Thread has no stop(); wait for the workers to finish
def fetchBlogUrls(city_a, city_b): print('\n\nSleeping for {} sec...'.format(sleepTime)) time.sleep(sleepTime) query = 'places to visit between ' + city_a + ' and ' + city_b + ' blogs' print('fetching : ' + query + '...') config = { 'use_own_ip': True, 'keyword': query, 'search_engines': ['bing'], 'num_pages_for_keyword': 2, 'scrape_method': 'selenium', 'sel_browser': 'chrome', 'do_caching': False, 'log_level': 'CRITICAL', 'print_results': 'summary', 'output_format': '' } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) urls = [] #pprint(search) for serp in search.serps: for link in serp.links: print(link.getLink()) urls.append(link.getLink()) #pauseInterval = random.uniform(1, 20) #print(pauseInterval) #urls = search(query, stop=20, pause=pauseInterval)#last result to retrieve #return urls #sources = [ #"http://kskrishnan.blogspot.com/2010/09/bangalore-to-mysore.html", #"https://www.makemytrip.com/blog/mysore-tales-1-making-way-to-mysores-hotspots", #"http://rajivc-food-travel-life.blogspot.com/2015/05/trip-to-gods-own-country-bangalore-to.html" #] #ca_sources = [ #"https://www.tripline.net/trip/San_Francisco_to_San_Diego_on_the_PCH-7521703244561003A0278A25A729E901", #"https://www.gapyear.com/articles/216212/13-incredible-stops-on-the-pacific-coast-highway", #"http://moon.com/2015/08/road-trip-itinerary-san-diego-to-san-francisco-in-two-days/", #"http://moon.com/2016/05/take-a-two-week-california-coast-road-trip/", #"http://californiathroughmylens.com/pacific-coast-highway-stops", #"http://californiathroughmylens.com/san-francisco-mendocino-guide", #"http://www.heleninwonderlust.co.uk/2014/03/ultimate-california-road-trip-itinerary-las-vegas/", #"http://www.worldofwanderlust.com/where-to-stop-on-the-pacific-coast-highway/", #"http://www.visitcalifornia.com/trip/highway-one-classic", #"http://independenttravelcats.com/2015/11/24/planning-a-california-pacific-coast-highway-road-trip-from-san-francisco-to-los-angeles/" #] #ca_sources1 = [ #"https://www.tripline.net/trip/San_Francisco_to_San_Diego_on_the_PCH-7521703244561003A0278A25A729E901", #"https://www.gapyear.com/articles/216212/13-incredible-stops-on-the-pacific-coast-highway"] #print(urls) return urls
def basic_usage(products_parsed): local_anti = 0 # See in the config.cfg file for possible values keywords = [y for x, y in products_parsed] config = { 'use_own_ip': 'True', 'search_engines': [ 'bing', ], 'num_pages_for_keyword': 1, 'num_results_per_page': 20, 'num_workers': step, 'keywords': keywords, 'SELENIUM': { 'sel_browser': 'chrome', }, 'do_caching': 'True' } try: sqlalchemy_session = scrape_with_config(config) except GoogleSearchError as e: print(e) # let's inspect what we got serps = sqlalchemy_session.serps loop = dict() for it, serp in enumerate(serps): loop[serp.query] = list() for link in serp.links: loop[serp.query].append({'link': link.link, 'title': link.title}) for it in products_parsed: links = loop.get(it[1], None) if not links: local_anti += 1 continue for link in links: if 'product' in link['link'] and 'instacart' in link['link']: req = requests.get(url=link['link']) if req.status_code != 404: product_list[it[0]]['link'] = link['link'] product_list[it[0]]['title'] = link['title'] product_list[it[0]]['content'] = req.content break else: product_list[it[0]]['link'] = link['link'] product_list[it[0]]['title'] = link['title'] product_list[it[0]]['content'] = None if not product_list[it[0]].get('link', False): local_anti += 1 return local_anti
def test_csv_output_static(self): """Test csv output. Test parsing 4 html pages with two queries and two pages per query and transforming the results to csv format. The cached file should be saved in 'data/csv_tests/', there should be as many files as search_engine * pages_for_keyword The keyword used in the static SERP pages MUST be 'some words' The filenames must be in the GoogleScraper cache format. """ import csv from GoogleScraper.output_converter import csv_fieldnames number_search_engines = len(all_search_engines) csv_outfile = 'data/tmp/csv_test.csv' config = { 'SCRAPING': { 'keyword': 'some words', 'search_engines': ','.join(all_search_engines), 'num_pages_for_keyword': 2, 'scrape_method': 'selenium' }, 'GLOBAL': { 'cachedir': 'data/csv_tests/', 'do_caching': 'True', 'verbosity': 0 }, 'OUTPUT': { 'output_filename': csv_outfile } } search = scrape_with_config(config) assert os.path.exists(csv_outfile), '{} does not exist'.format(csv_outfile) reader = csv.reader(open(csv_outfile, 'rt')) # the items that should always have a value: notnull = ('link', 'query', 'rank', 'domain', 'title', 'link_type', 'scrape_method', 'page_number', 'search_engine_name', 'snippet') for rownum, row in enumerate(reader): if rownum == 0: header = row header_keys = set(row) assert header_keys.issubset(set(csv_fieldnames)), 'Invalid CSV header: {}'.format(header) for item in notnull: assert row[header.index(item)], '{} has a item that has no value: {}'.format(item, row) self.assertAlmostEqual(number_search_engines * 2 * 10, rownum, delta=30)
def extract_urls(keywords_file, companies_list, proxy_list=None):
    '''
    Use GoogleScraper to extract URLs based on the combination of company name
    and keywords. The results are stored in a JSON file whose path and name are
    returned.

    Input: the paths of the keywords file and the companies list; both should be
    plain-text files.
    Output: a JSON file and a SQLite database file holding all query results are
    created automatically; the path of the JSON file is returned.
    '''
    if proxy_list:
        proxy_file = proxy_list
    else:
        full_path = os.path.realpath(__file__)
        path, filename = os.path.split(full_path)
        # print(path, filename)
        proxy_file = os.path.join(path, "ProxyProvider", "proxy.txt")

    # build the queries from company names and keywords
    # (a hedged sketch of create_query follows this function)
    query = create_query(keywords_file, companies_list)
    if len(query) == 0:
        print("All queries have been scraped")
        return companies_list + '.json'

    config = {
        'use_own_ip': False,
        'keywords': query,
        'check_proxies': False,
        'search_engines': 'google',
        'stop_on_detection': False,
        # 'google_sleeping_ranges': 5,
        'num_pages_for_keyword': 1,
        'scrape_method': 'selenium',  # http or selenium
        'sel_browser': 'Phantomjs',
        'num_workers': 1,
        'verbosity': 2,
        'do_caching': False,
        # 'sleeping_ranges': '5: 5, 10',
        'google_search_url': 'http://www.google.com/search?',
        'proxy_file': proxy_file,
        'output_filename': companies_list + '.json',
        'database_name': companies_list,
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    if companies_list.count('.') > 0:
        return companies_list[:companies_list.rindex('.')] + '.json'
    else:
        return companies_list + '.json'
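# create_query() combines the companies list with the keywords file into search
# queries. Its implementation is not shown; a minimal sketch under the assumption
# that both inputs are plain-text files with one entry per line. The original
# also appears to skip queries that were already scraped, which this sketch omits.
def create_query(keywords_file, companies_list):
    """Build 'company keyword' query strings from two plain-text files."""
    with open(keywords_file) as kf:
        keywords = [line.strip() for line in kf if line.strip()]
    with open(companies_list) as cf:
        companies = [line.strip() for line in cf if line.strip()]
    return ['{} {}'.format(company, keyword)
            for company in companies
            for keyword in keywords]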
def test_json_output_static(self): """Test json output. """ import json number_search_engines = len(all_search_engines) json_outfile = 'data/tmp/json_test.json' config = { 'SCRAPING': { 'keyword': 'some words', 'search_engines': ','.join(all_search_engines), 'num_pages_for_keyword': 2, 'scrape_method': 'selenium' }, 'GLOBAL': { 'cachedir': 'data/json_tests/', 'do_caching': 'True', 'verbosity': 0 }, 'OUTPUT': { 'output_filename': json_outfile } } search = scrape_with_config(config) assert os.path.exists(json_outfile), '{} does not exist'.format(json_outfile) file = open(json_outfile, 'r') try: results = json.load(file) except ValueError as e: print('Cannot parse output json file {}. Reason: {}'.format(json_outfile, e)) raise e # the items that should always have a value: notnull = ('link', 'rank', 'domain', 'title', 'link_type') num_results = 0 for item in results: for k, v in item.items(): if k == 'results': for res in v: num_results += 1 for item in notnull: assert res[item], '{} has a item that has no value: {}'.format(item, res) self.assertAlmostEqual(number_search_engines * 2 * 10, num_results, delta=30)
def run(self): n = self.param[ 'n'] * 10 # multiplier is set to collect far more results than required for establishing suggestions MAX_PER_PAGE = 100 # this is a limit imposed by GoogleScraper config = { 'use_own_ip': True, 'keyword': self.param['keywords'], 'search_engines': ['bing'], 'scrape_method': 'http', 'do_caching': False, 'log_level': self.verbose, 'num_pages_for_keyword': len(range(0, n, MAX_PER_PAGE)) if n > 0 else 1, 'num_results_per_page': min(n, MAX_PER_PAGE) if n > 0 else MAX_PER_PAGE, } if 'PROXY_FILE' in self.config and self.config['PROXY_FILE'] not in [ None, '' ]: config.update({ 'proxy_file': self.config['PROXY_FILE'], 'check_proxies': False }) # NB: check_proxies is a parameter aimed to make the (public) proxy address checked on a website, # so if using a private network proxy, this check is not required # scrape on keywords and get a connection to the cache database search = scrape_with_config(config) # check the status and raise an exception if scraping failed for serp in search.serps: if serp.status != 'successful' and serp.no_results: self.logger.error(serp.status) exit(1) # collect found links links, suggestions = [ link for serp in search.serps for link in serp.links ], [] k, l = 0, len(links) while len(suggestions) < self.param['n'] and k < l: # TODO: write a filter # e.g. for: # - favouring links with domain containing one or more of the keywords) # - excluding links on specific forums and/or download sites # - exluding maliious domains acording to Norton Safe Web or other security sources (e.g. Webputation) suggestions.append({ 'link': links[k].link, 'title': links[k].title, 'text': links[k].snippet }) k += 1 return suggestions
def test_all_search_engines_in_selenium_mode(self): """ Very simple test case that assures that scraping all search engines in selenium mode works. Basically copy paste from `test_all_search_engines_in_http_mode`. """ config = { 'keyword': 'dont look back in anger', 'search_engines': '*', 'scrape_method': 'selenium', 'sel_browser': 'chrome', 'browser_mode': 'headless', 'chromedriver_path': '/home/nikolai/projects/private/Drivers/chromedriver', 'do_caching': False, 'num_results_per_page': 10, } search = scrape_with_config(config) self.assertLess(search.started_searching, search.stopped_searching) self.assertEqual(search.number_proxies_used, 1) self.assertEqual(search.number_search_engines_used, len(all_search_engines)) self.assertEqual(search.number_search_queries, 1) self.assertEqual(len(search.serps), len(all_search_engines)) for i, serp in enumerate(search.serps): self.assertEqual(search.serps[i].page_number, 1) self.assertEqual(serp.status, 'successful') self.assertIn(serp.search_engine_name.lower(), all_search_engines) self.assertEqual(serp.scrape_method, 'selenium') self.assertTrue(serp.num_results_for_query) self.assertAlmostEqual(serp.num_results, 10, delta=2) self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query) self.assertEqual(serp.no_results, False) self.assertEqual(serp.num_results, len(serp.links)) for j, link in enumerate(serp.links): if link.link_type == 'results': self.assertTrue(is_string_and_longer_than(link.title, 3)) self.assertTrue(is_string_and_longer_than(link.snippet, 3)) self.assertTrue(is_string_and_longer_than(link.link, 10)) self.assertTrue(link.domain in link.link) self.assertTrue(isinstance(link.rank, int))
def test_csv_file_header_always_the_same(self): """ Check that csv files have always the same order in their header. """ csv_outfile_1 = os.path.join(base, "data/tmp/csvout1.csv") csv_outfile_2 = os.path.join(base, "data/tmp/csvout2.csv") config = { "keyword": "some words", "search_engines": all_search_engines, "num_pages_for_keyword": 2, "scrape_method": "selenium", "cachedir": os.path.join(base, "data/csv_tests/"), "do_caching": True, "verbosity": 0, "output_filename": csv_outfile_1, } search = scrape_with_config(config) search = scrape_with_config(config) config.update({"output_filename": csv_outfile_2}) search = scrape_with_config(config) assert os.path.isfile(csv_outfile_1) and os.path.isfile(csv_outfile_2) file1 = open(csv_outfile_1, "rt") file2 = open(csv_outfile_2, "rt") import csv reader1, reader2 = csv.DictReader(file1), csv.DictReader(file2) header1, header2 = reader1.fieldnames, reader2.fieldnames from GoogleScraper.output_converter import csv_fieldnames assert header1 == header2 == csv_fieldnames
def test_json_output_static(self): """Test json output. """ import json number_search_engines = len(all_search_engines) json_outfile = os.path.join(base, "data/tmp/json_test.json") config = { "keyword": "some words", "search_engines": all_search_engines, "num_pages_for_keyword": 2, "scrape_method": "selenium", "cachedir": os.path.join(base, "data/json_tests/"), "do_caching": True, "verbosity": 0, "output_filename": json_outfile, } search = scrape_with_config(config) assert os.path.exists(json_outfile), "{} does not exist".format(json_outfile) file = open(json_outfile, "r") try: results = json.load(file) except ValueError as e: print("Cannot parse output json file {}. Reason: {}".format(json_outfile, e)) raise e # the items that should always have a value: notnull = ("link", "rank", "domain", "title", "link_type") num_results = 0 for item in results: for k, v in item.items(): if k == "results": for res in v: num_results += 1 for item in notnull: assert res[item], "{} has a item that has no value: {}".format(item, res) self.assertAlmostEqual(number_search_engines * 2 * 10, num_results, delta=30)
def scrapeArt(self, artistName, albumName, filename):
    searchQuery = artistName + " " + albumName

    # Configure the scraper
    config = {
        "SCRAPING": {
            "keyword": searchQuery,
            "search_engines": "google",
            "search_type": "image",
            "scrape_method": "http",
            "num_results_per_page": 1,
            "num_pages_for_keyword": 1,
        },
        "GLOBAL": {"verbosity": "0", "do_caching": "False"},
    }

    # Run the search and scrape results
    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    # Save the first image search result
    image_url = search.serps[0].links[0].link

    # Parse image url into a usable format
    url = urllib.parse.unquote(image_url)

    # Write the image data to file
    target_directory = os.path.join(self.outputDir, artistName + "/")
    target_directory = os.path.join(target_directory, albumName + "/")

    fileExt = findFileExtension(url)
    print(self.albumName)
    if fileExt == url:
        print("File extension: ", fileExt, " is not supported.")
        print("Error occurred on Artist: ", self.artistName, " Album: ", self.albumName)
        return

    finalFilename = filename + fileExt
    with open(os.path.join(target_directory, finalFilename), "wb") as f:
        try:
            content = requests.get(url).content
            f.write(content)
        except Exception as e:
            pass
def scrape_query(self, mode, search_engines='*', query='', random_query=False, sel_browser='Chrome'): if random_query: query = random_word() config = { 'SCRAPING': { 'use_own_ip': 'True', 'keyword': query, 'search_engines': search_engines, 'num_pages_for_keyword': 1, 'scrape_method': mode, }, 'GLOBAL': { 'do_caching': 'False', 'verbosity': 0 }, 'SELENIUM': { 'sel_browser': sel_browser } } search = scrape_with_config(config) if search_engines == '*': assert search.number_search_engines_used == len(all_search_engines) else: assert search.number_search_engines_used == len( search_engines.split(',')) if search_engines == '*': assert len(search.used_search_engines.split(',')) == len( all_search_engines) else: assert len(search.used_search_engines.split(',')) == len( search_engines.split(',')) assert search.number_proxies_used == 1 assert search.number_search_queries == 1 assert search.started_searching < search.stopped_searching return search
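# random_word() is used above to generate a throwaway query when random_query is
# set. Its implementation is not part of this snippet; a plausible sketch that
# draws from a small hard-coded list (the real helper may read a dictionary file):
import random

def random_word():
    words = ['banana', 'lighthouse', 'violin', 'glacier', 'pepper']
    return random.choice(words)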
def test_all_search_engines_in_http_mode(self): """ Very simple test case that assures that scraping all search engines in http mode works. """ config = { 'keyword': 'in this world', 'search_engines': '*', 'scrape_method': 'http', 'do_caching': False, 'num_results_per_page': 10, 'log_level': 'WARNING', 'print_results': 'summarize', } search = scrape_with_config(config) self.assertLess(search.started_searching, search.stopped_searching) self.assertEqual(search.number_proxies_used, 1) self.assertEqual(search.number_search_engines_used, len(all_search_engines)) self.assertEqual(search.number_search_queries, 1) self.assertEqual(len(search.serps), len(all_search_engines)) for i, serp in enumerate(search.serps): self.assertEqual(search.serps[i].page_number, 1) self.assertEqual(serp.status, 'successful') self.assertIn(serp.search_engine_name.lower(), all_search_engines) self.assertEqual(serp.scrape_method, 'http') self.assertTrue(serp.num_results_for_query) self.assertTrue(serp.num_results >= 7) self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query) self.assertEqual(serp.num_results, len(serp.links)) for j, link in enumerate(serp.links): if link.link_type == 'results': self.assertTrue(is_string_and_longer_than(link.title, 3)) # no snippet needed actually # self.assertTrue(is_string_and_longer_than(link.snippet, 3)) self.assertTrue(is_string_and_longer_than(link.link, 10)) self.assertTrue(link.domain in link.link) self.assertTrue(isinstance(link.rank, int))
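# The assertions above rely on an is_string_and_longer_than() helper defined
# elsewhere in the test suite. A minimal sketch of the behaviour implied by its
# call sites:
def is_string_and_longer_than(value, n):
    """Return True if value is a str with more than n characters."""
    return isinstance(value, str) and len(value) > n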
def slat_(self, config): try: if str('wiki') in config['search_engines']: get_links = (str('wikipedia'), 0, None, config['keyword'], None) wiki_get(get_links) elif str('info_wars') in config['search_engines']: get_links = (str('info_wars'), 0, None, config['keyword'], None) info_wars_get(get_links) elif str('scholar') in config['search_engines']: get_links = (str('scholar'), 0, None, config['keyword'], None) search_scholar(get_links) elif str('scholarpedia') in config['search_engines']: get_links = (str('scholar'), 0, None, config['keyword'], None) scholar_pedia_get(get_links) else: search = scrape_with_config(config) links = [] for serp in search.serps: print(serp) links.extend([link.link for link in serp.links]) # This code block jumps over gate two # The (possibly private, or hosted server as a gatekeeper). if len(links) > self.NUM_LINKS: links = links[0:self.NUM_LINKS] if len(links) > 0: print(links) buffer = None se_ = config['search_engines'] category = config['keyword'] get_links = ((se_, index, link, category, buffer) for index, link in enumerate(links)) for gl in get_links: process(gl) # map over the function in parallel since it's 2018 #b = db.from_sequence(get_links,npartitions=8) #_ = list(b.map(process).compute()) except GoogleSearchError as e: print(e) return None print('done scraping')
def fetch_image_urls(self, keyword, num_urls): self.gscraper_config['SCRAPING']['keyword'] = keyword self.gscraper_config['SCRAPING']['num_pages_for_keyword'] =\ self.get_num_pages_from_num_urls(num_urls) try: search = scrape_with_config(self.gscraper_config) except GoogleSearchError as e: logging.info(e) search = '' return image_urls = list() for serp in search.serps: image_urls.extend([link.link for link in serp.links]) if num_urls > len(image_urls): return image_urls else: return image_urls[:num_urls]
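# get_num_pages_from_num_urls() is not shown above. A hedged sketch assuming it
# simply converts a requested URL count into a page count, given a fixed number
# of results per SERP page (the constant below is an assumption; in the snippet
# it is a method on the scraper class rather than a free function):
import math

RESULTS_PER_PAGE = 100  # assumed image-SERP page size

def get_num_pages_from_num_urls(num_urls):
    """Round the requested number of URLs up to whole result pages."""
    return max(1, math.ceil(num_urls / RESULTS_PER_PAGE))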
def run_job(title, url):
    query = title
    config = {
        'use_own_ip': True,
        'keyword': query,
        'search_engines': ['Google'],
        # 'num_results_per_page': 10,  # this is ignored by bing, 10 results per page
        'num_pages_for_keyword': 100,
        'scrape_method': 'selenium',
        'num_workers': 4,
        # 'scrape_method': 'http',
        'sel_browser': 'chrome',
        # 'do_sleep': False,
        # 'browser_mode': 'normal',
        'browser_mode': 'headless',
        # 'chromedriver_path': '/Users/johnny/Downloads/chromedriver',
        'chromedriver_path': '/app/chromeDriver/chromedriver',
        'do_caching': False,
        # 'print_results': 'summarize',
        'google_search_url': url,
    }
    search = scrape_with_config(config)

    result = []
    print(search.serps)
    for serp in search.serps:
        for link in serp.links:
            if link.snippet and link.visible_link:
                title = link.snippet.replace("\n", "")
                link = link.visible_link
                if len(title) > 50:
                    title = f"{title[:30]}..."
                if 'https' not in link[:5]:
                    link = f'http://{link}'
                result.append({'title': title, 'link': link})
                print(title)
                print(link)
                print("-------")
    return result
def main(arg_list): # Get the arguments path = arg_list[0] keyword = arg_list[1] # Create our target directory if it doesn't exist if not os.path.exists(path): os.mkdir(path) # Create our configuration file config = ConfigFactory.create_config(keyword) try: sqlalchemy_session = scrape_with_config(config) except GoogleSearchError: print('Error!') image_urls = [] search = sqlalchemy_session.query(ScraperSearch).all()[-1] for serp in search.serps: image_urls.extend([link.link for link in serp.links]) print('[i] Going to scrape {num} images and saving them in "{dir}"'.format( num=len(image_urls), dir=path )) thread_count = 100 threads = [FetchResource(path, []) for i in range(thread_count)] while image_urls: for thread in threads: try: thread.urls.append(image_urls.pop()) except IndexError: break threads = [thread for thread in threads if thread.urls] for thread in threads: thread.start() for thread in threads: thread.join()
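# ConfigFactory.create_config() is imported from elsewhere in that project. A
# hedged, illustrative sketch of what such a factory might return for this
# image-scraping script; the keys and values the original uses are not known:
class ConfigFactory:
    @staticmethod
    def create_config(keyword):
        # assumed image-search configuration for scrape_with_config()
        return {
            'keyword': keyword,
            'search_engines': ['google'],
            'search_type': 'image',
            'scrape_method': 'http',
            'num_pages_for_keyword': 1,
            'do_caching': False,
        }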
def search(data):
    global CONFIG
    CONFIG['keyword'] = data
    try:
        result = scrape_with_config(CONFIG)
    except (GoogleSearchError, SocketError) as _:
        return 'Not found'
    buf = []
    for serp in result.serps:
        for link in serp.links:
            if link.snippet is not None:
                buf.append(link.snippet.strip() + os.linesep)
    return os.linesep.join(buf)
def scraper(): os.remove("google_scraper.db") config = { 'use_own_ip': True, 'keyword_file': 'sent.txt', # 'bing_search_url' : 'http://www.bing.com/?mkt=zh-CN', 'search_engines': ['google'], # 'search_engines': ['bing'], 'num_pages_for_keyword': 1, 'scrape_method': 'selenium', # 'scrape_method': 'http-async', #'sel_browser': 'firefox', # uncomment one when using selenium mode 'sel_browser': 'chrome', 'do_caching': False, 'clean_cache_files': False, 'print_results': 'summarize', # 'output_filename': 'out.csv', # added for async mode # 'google_sleeping_ranges' : { \ # 1: (2, 3), \ # 5: (3, 5), \ # 30: (10, 20), \ # 127: (30, 50), \ # } } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) num_result = [] for serp in search.serps: # print("-------------------------------------------------------") if not serp.effective_query: # if serp.no_results == False: # not work for bing num_result.append(serp.num_results_for_query) else: num_result.append("0") # no result # print(serp.num_results_for_query) # print(serp.effective_query) # print(serp.no_results) return num_result
def getUrls(keyword): print(keyword) config = { 'use_own_ip': 'False', 'keyword': keyword + " site:en.wikipedia.org", 'search_engines': ['bing', ], 'num_pages_for_keyword': 1, 'scrape_method': 'http', 'do_caching': 'False', } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) return search.serps
def related_search():
    target_directory = 'related/'

    # See in the config.cfg file for possible values
    config = {
        'keyword': 'web siling',  # :D hehe have fun my dear friends
        'search_engines': 'yahoo',  # duckduckgo not supported
        'search_type': 'related',
        'scrapemethod': 'selenium'
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(">>>", e)

    for search in sqlalchemy_session.query(ScraperSearch).all():
        for serp in search.serps:
            # print(serp, dir(serp))
            for keyword in serp.keywords:
                print(keyword)
def scrap(self, keyword): keywords = [keyword] # See in the config.cfg file for possible values config = { 'use_own_ip': 'False', 'keywords': keywords, 'search_engines': ['google'], 'num_pages_for_keyword': 2, 'scrape_method': 'http', # selenium # 'sel_browser': 'chrome', uncomment if scrape_method is selenium # 'executable_path': 'path\to\chromedriver' or 'path\to\phantomjs', 'do_caching': 'True', 'cachedir': '/tmp/.scrapecache/', 'database_name': '/tmp/google_scraper', 'clean_cache_after': 24, 'output_filename': None, 'print_results': 'all', } try: return scrape_with_config(config) except GoogleSearchError: print(traceback.print_exc())
def scrape_query(self, mode, search_engines='*', query='', random_query=False, sel_browser='Chrome'): if random_query: query = random_word() config = { 'SCRAPING': { 'use_own_ip': 'True', 'keyword': query, 'search_engines': search_engines, 'num_pages_for_keyword': 1, 'scrape_method': mode, }, 'GLOBAL': { 'do_caching': 'False', 'verbosity': 0 }, 'SELENIUM': { 'sel_browser': sel_browser } } search = scrape_with_config(config) if search_engines == '*': assert search.number_search_engines_used == len(all_search_engines) else: assert search.number_search_engines_used == len(search_engines.split(',')) if search_engines == '*': assert len(search.used_search_engines.split(',')) == len(all_search_engines) else: assert len(search.used_search_engines.split(',')) == len(search_engines.split(',')) assert search.number_proxies_used == 1 assert search.number_search_queries == 1 assert search.started_searching < search.stopped_searching return search
def fetch_info(self, keyword):
    self.gscraper_config['SCRAPING']['keyword'] = keyword

    info = defaultdict(dict)
    info['num_results_for_query']['baidu'] = 0
    info['num_results_for_query']['google'] = 0

    for i in range(0, RETRY):
        try:
            search = scrape_with_config(self.gscraper_config)
        except GoogleSearchError as e:
            logging.info(e)
            search = ''
            return

        for serp in search.serps:
            text = serp.num_results_for_query
            # (a hedged sketch of get_first_number_from_text follows this function)
            if 'baidu' in serp.search_engine_name:
                info['num_results_for_query']['baidu'] = int(cogtu_misc.get_first_number_from_text(text))
            elif 'google' in serp.search_engine_name:
                info['num_results_for_query']['google'] = int(cogtu_misc.get_first_number_from_text(text))

        # 'is not' compares identity, not value; use != for the numeric check
        if info['num_results_for_query']['baidu'] != 0 or \
                info['num_results_for_query']['google'] != 0:
            break
        logging.info('RETRYING...')

    return info
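# cogtu_misc.get_first_number_from_text() is an external helper used above. A
# hedged sketch: pull the first integer out of a result-count string such as
# "About 1,230,000 results (0.52 seconds)".
import re

def get_first_number_from_text(text):
    match = re.search(r'[\d][\d,.\s]*', text or '')
    if not match:
        return 0
    return int(re.sub(r'[^\d]', '', match.group(0)))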
def run(self): n = self.param['n'] * 10 # multiplier is set to collect far more results than required for establishing suggestions MAX_PER_PAGE = 100 # this is a limit imposed by GoogleScraper config = { 'use_own_ip': True, 'keyword': self.param['keywords'], 'search_engines': ['google'], 'scrape_method': 'http', 'do_caching': False, 'log_level': self.verbose, 'num_pages_for_keyword': len(range(0, n, MAX_PER_PAGE)) if n > 0 else 1, 'num_results_per_page': min(n, MAX_PER_PAGE) if n > 0 else MAX_PER_PAGE, } if 'PROXY_FILE' in self.config and self.config['PROXY_FILE'] not in [None, '']: config.update({'proxy_file': self.config['PROXY_FILE'], 'check_proxies': False}) # NB: check_proxies is a parameter aimed to make the (public) proxy address checked on a website, # so if using a private network proxy, this check is not required # scrape on keywords and get a connection to the cache database search = scrape_with_config(config) # check the status and raise an exception if scraping failed for serp in search.serps: if serp.status != 'successful' and serp.no_results: self.logger.error(serp.status) exit(1) # collect found links links, suggestions = [link for serp in search.serps for link in serp.links], [] k, l = 0, len(links) while len(suggestions) < self.param['n'] and k < l: # TODO: write a filter # e.g. for: # - favouring links with domain containing one or more of the keywords) # - excluding links on specific forums and/or download sites # - exluding maliious domains acording to Norton Safe Web or other security sources (e.g. Webputation) suggestions.append({'link': links[k].link, 'title': links[k].title, 'text': links[k].snippet}) k += 1 return suggestions
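# A hedged sketch of the filter the TODO above describes: favour links whose
# domain contains one of the keywords and drop links from forums or download
# sites. The blacklist terms are illustrative only, and the link objects are
# assumed to expose the .domain attribute used elsewhere in these snippets.
def filter_links(links, keywords, blacklist=('forum', 'board', 'download', 'torrent')):
    def score(link):
        domain = (link.domain or '').lower()
        # one point per keyword that appears in the domain
        return sum(1 for kw in keywords if kw.lower() in domain)

    kept = [l for l in links if not any(b in (l.domain or '').lower() for b in blacklist)]
    # highest-scoring (most keyword-relevant) domains first
    return sorted(kept, key=score, reverse=True)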
def run_crawler(searchString, jsonFileName): # See in the config.cfg file for possible values config = { 'keyword': searchString, 'search_engines':['google', 'bing'], 'num_pages_for_keyword': 10, 'output_filename': jsonFileName, 'SCRAPING': { 'use_own_ip': 'True', 'num_pages_for_keyword': 1 }, 'SELENIUM': { 'sel_browser': 'chrome', }, 'GLOBAL': { 'do_caching': 'False' } } try: sqlalchemy_session = scrape_with_config(config) except GoogleSearchError as e: print("GoogleSearchError") print(e)
def test_asynchronous_mode_bing_and_yandex(self): """ Expected results: - around 60 results - 30 results for bing and 30 results for yandex - valid json file with the contents """ results_file = os.path.join(tempfile.gettempdir(), 'async_results.json') if os.path.exists(results_file): os.remove(results_file) config = { 'keyword': 'where is my mind', 'search_engines': ['bing', 'yandex'], 'num_results_per_page': 10, 'num_pages_for_keyword': 3, 'scrape_method': 'http-async', 'output_filename': results_file, 'do_caching': False, } search = scrape_with_config(config) self.assertEqual(search.keyword_file, '') self.assertLess(search.started_searching, search.stopped_searching) self.assertEqual(search.number_proxies_used, 1) self.assertEqual(search.number_search_engines_used, 2) self.assertEqual(search.number_search_queries, 1) self.assertEqual(len(search.serps), 6) # test that we have twice [1,2,3] as page numbers self.assertSetEqual(set([serp.page_number for serp in search.serps]), {1,2,3}) self.assertAlmostEqual(sum([len(serp.links) for serp in search.serps]), 60, delta=10) self.assertAlmostEqual(sum([len(serp.links) for serp in search.serps if serp.search_engine_name == 'yandex']), 30, delta=5) self.assertAlmostEqual(sum([len(serp.links) for serp in search.serps if serp.search_engine_name == 'bing']), 30, delta=5) for serp in search.serps: self.assertEqual(serp.query, 'where is my mind') self.assertEqual(serp.status, 'successful') self.assertIn(serp.search_engine_name.lower(), ('bing', 'yandex')) self.assertEqual(serp.scrape_method, 'http-async') if serp.search_engine_name != 'yandex': self.assertTrue(is_string_and_longer_than(serp.num_results_for_query, 5)) self.assertAlmostEqual(serp.num_results, 10, delta=2) self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query) self.assertEqual(serp.num_results, len(serp.links)) predicate_true_at_least_n_times(lambda v: is_string_and_longer_than(v, 3), serp.links, 7, 'snippet') for link in serp.links: if link.link_type == 'results': self.assertTrue(is_string_and_longer_than(link.title, 3)) self.assertTrue(is_string_and_longer_than(link.link, 10)) self.assertTrue(isinstance(link.rank, int)) # test that the json output is correct self.assertTrue(os.path.isfile(results_file)) with open(results_file, 'rt') as file: obj = json.load(file) # check the same stuff again for the json file for i, page in enumerate(obj): self.assertEqual(page['effective_query'], '') self.assertEqual(page['num_results'], str(len(page['results']))) if page['search_engine_name'].lower() != 'yandex': self.assertTrue(is_string_and_longer_than(page['num_results_for_query'], 5)) self.assertEqual(page['query'], 'where is my mind') self.assertEqual(page['requested_by'], 'localhost') for j, result in enumerate(page['results']): if result['link_type'] == 'results': self.assertTrue(is_string_and_longer_than(result['title'], 3)) self.assertTrue(is_string_and_longer_than(result['snippet'], 3)) self.assertTrue(is_string_and_longer_than(result['link'], 10)) self.assertTrue(isinstance(int(result['rank']), int))
def test_google_with_chrome_and_json_output(self): """ Very common use case: Ensures that we can scrape three continuous sites with Google using chrome in normal mode and save the results to a JSON file. """ results_file = os.path.join(tempfile.gettempdir(), 'results-chrome.json') if os.path.exists(results_file): os.remove(results_file) query = 'Food New York' config = { 'keyword': query, 'search_engines': ['Google'], 'num_results_per_page': 100, 'num_pages_for_keyword': 3, 'scrape_method': 'selenium', 'sel_browser': 'chrome', 'do_sleep': False, 'browser_mode': 'normal', 'chromedriver_path': '/home/nikolai/projects/private/Drivers/chromedriver', 'output_filename': results_file, 'do_caching': False, } search = scrape_with_config(config) self.assertLess(search.started_searching, search.stopped_searching) self.assertEqual(search.number_proxies_used, 1) self.assertEqual(search.number_search_engines_used, 1) self.assertEqual(search.number_search_queries, 1) self.assertEqual(len(search.serps), 3) self.assertEqual(search.serps[0].page_number, 1) self.assertEqual(search.serps[1].page_number, 2) self.assertEqual(search.serps[2].page_number, 3) for serp in search.serps: self.assertEqual(serp.status, 'successful') self.assertEqual(serp.search_engine_name.lower(), 'google') self.assertEqual(serp.scrape_method, 'selenium') self.assertTrue(serp.num_results_for_query) self.assertAlmostEqual(int(serp.num_results), 100, delta=10) self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query) self.assertEqual(serp.no_results, False) self.assertEqual(serp.num_results, len(serp.links)) for j, link in enumerate(serp.links): if link.link_type == 'results': self.assertTrue(is_string_and_longer_than(link.title, 3)) self.assertTrue(is_string_and_longer_than(link.snippet, 3)) self.assertTrue(is_string_and_longer_than(link.link, 10)) self.assertTrue(isinstance(link.rank, int)) # test that the json output is correct self.assertTrue(os.path.isfile(results_file)) with open(results_file, 'rt') as file: obj = json.load(file) # check the same stuff again for the json file for i, page in enumerate(obj): self.assertEqual(page['effective_query'], '') self.assertEqual(page['no_results'], 'False') self.assertEqual(page['num_results'], str(len(page['results']))) self.assertAlmostEqual(int(page['num_results']), 100, delta=10) self.assertTrue(is_string_and_longer_than(page['num_results_for_query'], 5)) self.assertEqual(page['page_number'], str(i+1)) self.assertEqual(page['query'], query) # todo: Test requested_at self.assertEqual(page['requested_by'], 'localhost') for j, result in enumerate(page['results']): if result['link_type'] == 'results': self.assertTrue(is_string_and_longer_than(result['title'], 3)) self.assertTrue(is_string_and_longer_than(result['snippet'], 3)) self.assertTrue(is_string_and_longer_than(result['link'], 10)) self.assertTrue(isinstance(int(result['rank']), int))
def test_http_mode_google_csv_output(self): results_file = os.path.join(tempfile.gettempdir(), 'results.csv') if os.path.exists(results_file): os.remove(results_file) config = { 'keyword': 'banana', 'search_engines': ['Google'], 'num_results_per_page': 10, 'num_pages_for_keyword': 2, 'scrape_method': 'http', 'output_filename': results_file, 'do_caching': False, } search = scrape_with_config(config) self.assertLess(search.started_searching, search.stopped_searching) self.assertEqual(search.number_proxies_used, 1) self.assertEqual(search.number_search_engines_used, 1) self.assertEqual(search.number_search_queries, 1) self.assertEqual(len(search.serps), 2) self.assertEqual(search.serps[0].page_number, 1) self.assertEqual(search.serps[1].page_number, 2) for serp in search.serps: self.assertEqual(serp.query, 'banana') self.assertEqual(serp.status, 'successful') self.assertEqual(serp.search_engine_name.lower(), 'google') self.assertEqual(serp.scrape_method, 'http') self.assertTrue(serp.num_results_for_query) self.assertAlmostEqual(serp.num_results, 10, delta=2) self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query) self.assertEqual(serp.no_results, False) self.assertEqual(serp.num_results, len(serp.links)) predicate_true_at_least_n_times(lambda v: is_string_and_longer_than(v, 3), serp.links, 7, 'snippet') for link in serp.links: if link.link_type == 'results': self.assertTrue(is_string_and_longer_than(link.title, 3)) self.assertTrue(is_string_and_longer_than(link.link, 10)) self.assertTrue(isinstance(link.rank, int)) # test that the csv output is correct self.assertTrue(os.path.isfile(results_file)) with open(results_file, 'rt') as file: reader = csv.DictReader(file, delimiter=',') rows = [row for row in reader] self.assertAlmostEqual(20, len(rows), delta=3) for row in rows: self.assertEqual(row['query'], 'banana') self.assertTrue(is_string_and_longer_than(row['requested_at'], 5)) self.assertTrue(int(row['num_results'])) self.assertEqual(row['scrape_method'], 'http') self.assertEqual(row['requested_by'], 'localhost') self.assertEqual(row['search_engine_name'], 'google') self.assertIn(int(row['page_number']), [1,2]) self.assertEqual(row['status'], 'successful') self.assertTrue(row['no_results'] == 'False') self.assertTrue(row['effective_query'] == '') if row['link_type'] == 'results': self.assertTrue(is_string_and_longer_than(row['title'], 3)) self.assertTrue(is_string_and_longer_than(row['snippet'], 3)) self.assertTrue(is_string_and_longer_than(row['domain'], 5)) self.assertTrue(is_string_and_longer_than(row['visible_link'], 5)) self.assertTrue(is_string_and_longer_than(row['num_results_for_query'], 3)) self.assertTrue(is_string_and_longer_than(row['link'], 10)) self.assertTrue(row['rank'].isdigit()) # ensure that at least 90% of all entries have a string as snippet predicate_true_at_least_n_times(lambda v: is_string_and_longer_than(v, 3), rows, int(0.8*len(rows)), 'snippet')
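# predicate_true_at_least_n_times() is a test-suite helper that is not shown
# here. A sketch of the behaviour implied by its call sites: assert that a
# predicate holds for at least n of the items, reading one attribute or key.
def predicate_true_at_least_n_times(predicate, items, n, key):
    def value_of(item):
        return item[key] if isinstance(item, dict) else getattr(item, key)

    hits = sum(1 for item in items if predicate(value_of(item)))
    assert hits >= n, '{} of {} items satisfied the predicate, expected at least {}'.format(
        hits, len(items), n)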
def image_search(query, engines, pages): target_directory = 'images/' # See in the config.cfg file for possible values config = { 'use_own_ip': True, 'keyword': query, 'search_engines': [engines], 'search_type': 'image', 'num_pages_for_keyword': pages, 'scrape_method': 'http', 'do_caching': False } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) image_urls = [] search = search.query(ScraperSearch).all()[-1] for serp in search.serps: image_urls.extend( [link.link for link in serp.links] ) print('[i] Going to scrape {num} images and saving them in "{dir}"'.format( num=len(image_urls), dir=target_directory )) import threading,requests, os, urllib class FetchResource(threading.Thread): """Grabs a web resource and stores it in the target directory""" def __init__(self, target, urls): super().__init__() self.target = target self.urls = urls def run(self): for url in self.urls: url = urllib.parse.unquote(url) with open(os.path.join(self.target, url.split('/')[-1]), 'wb') as f: try: content = requests.get(url).content f.write(content) except Exception as e: pass print('[+] Fetched {}'.format(url)) # make a directory for the results try: os.mkdir(target_directory) except FileExistsError: pass # fire up 100 threads to get the images num_threads = 100 threads = [FetchResource('images/', []) for i in range(num_threads)] while image_urls: for t in threads: try: t.urls.append(image_urls.pop()) except IndexError as e: break threads = [t for t in threads if t.urls] for t in threads: t.start() for t in threads: t.join()
""" from GoogleScraper import scrape_with_config, GoogleSearchError if __name__ == '__main__': # See in the config.cfg file for possible values config = { 'SCRAPING': { 'use_own_ip': 'True', 'keyword': 'Hello World' }, 'SELENIUM': { 'sel_browser': 'chrome', 'manual_captcha_solving': 'True' }, 'GLOBAL': { 'do_caching': 'True' } } try: # scrape() and scrape_with_config() will reuturn a handle to a sqlite database with the results db = scrape_with_config(config) print(db.execute('SELECT * FROM link').fetchall()) except GoogleSearchError as e: print(e)
['"{}"'.format(s) for s in sentence.split(',') if len(s) > 25] ) return chunks # write the chunks to a file with open('chunks.txt', 'wt') as f: for chunk in make_chunks(text): f.write(chunk + '\n') # # See in the config.cfg file for possible values config = { 'use_own_ip': True, 'keyword_file': 'chunks.txt', 'search_engines': ['google'], 'num_pages_for_keyword': 1, 'scrape_method': 'selenium', 'sel_browser': 'chrome', } try: search = scrape_with_config(config) except GoogleSearchError as e: print(e) for serp in search.serps: # if the original query yielded some results and thus was found by google. if not serp.effective_query: print('Found plagiarized content: "{}"'.format(serp.query))