def test_csv_file_header_always_the_same(self):
        """
        Check that csv files always have the same column order in their header.
        """
        csv_outfile_1 = os.path.join(base, 'data/tmp/csvout1.csv')
        csv_outfile_2 = os.path.join(base, 'data/tmp/csvout2.csv')

        config = {
            'keyword': 'some words',
            'search_engines': all_search_engines,
            'num_pages_for_keyword': 2,
            'scrape_method': 'selenium',
            'cachedir': os.path.join(base, 'data/csv_tests/'),
            'do_caching': True,
            'verbosity': 0,
            'output_filename': csv_outfile_1,
        }
        search = scrape_with_config(config)

        search = scrape_with_config(config)
        config.update({'output_filename': csv_outfile_2})
        search = scrape_with_config(config)

        assert os.path.isfile(csv_outfile_1) and os.path.isfile(csv_outfile_2)

        file1 = open(csv_outfile_1, 'rt')
        file2 = open(csv_outfile_2, 'rt')

        import csv
        reader1, reader2 = csv.DictReader(file1), csv.DictReader(file2)

        header1, header2 = reader1.fieldnames, reader2.fieldnames
        from GoogleScraper.output_converter import csv_fieldnames

        assert header1 == header2 == csv_fieldnames
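For reference, a stable header order like the one asserted above is what csv.DictWriter produces when it is given an explicit field list; a minimal stdlib sketch (the field names below are illustrative only, the real ones come from GoogleScraper.output_converter.csv_fieldnames):

import csv

fieldnames = ['query', 'rank', 'link', 'title', 'snippet']  # illustrative ordering only

with open('out.csv', 'w', newline='') as f:
    # an explicit fieldnames list fixes the column order for every file written with it
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    writer.writerow({'query': 'some words', 'rank': 1, 'link': 'https://example.com'})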
Example #3
    def test_csv_output_static(self):
        """Test csv output.

        Test parsing 4 html pages with two queries and two pages per query and
        transforming the results to csv format.

        The cached files should be saved in 'data/csv_tests/'; there should
        be as many files as search_engines * pages_for_keyword.

        The keyword used in the static SERP pages MUST be 'some words'.

        The filenames must be in the GoogleScraper cache format.
        """

        import csv
        from GoogleScraper.output_converter import csv_fieldnames

        number_search_engines = len(all_search_engines)
        csv_outfile = os.path.join(base, 'data/tmp/csv_test.csv')

        config = {
            'keyword': 'some words',
            'search_engines': all_search_engines,
            'num_pages_for_keyword': 2,
            'scrape_method': 'selenium',
            'cachedir': os.path.join(base, 'data/csv_tests/'),
            'do_caching': True,
            'verbosity': 0,
            'output_filename': csv_outfile,
        }
        scrape_with_config(config)

        assert os.path.exists(csv_outfile), '{} does not exist'.format(
            csv_outfile)

        reader = csv.reader(open(csv_outfile, 'rt'))

        # the items that should always have a value:
        notnull = ('link', 'query', 'rank', 'domain', 'title', 'link_type',
                   'scrape_method', 'page_number', 'search_engine_name',
                   'snippet')

        for rownum, row in enumerate(reader):
            if rownum == 0:
                header = row
                header_keys = set(row)
                assert header_keys.issubset(set(
                    csv_fieldnames)), 'Invalid CSV header: {}'.format(header)

            for item in notnull:
                assert row[header.index(item)], \
                    '{} has an item that has no value: {}'.format(item, row)

        self.assertAlmostEqual(number_search_engines * 2 * 10,
                               rownum,
                               delta=30)
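The docstring above also states that the cache directory should hold one file per search engine and page; a minimal sketch of that extra check, assuming nothing else is stored in 'data/csv_tests/' (base and all_search_engines as used in the test):

import os

cachedir = os.path.join(base, 'data/csv_tests/')
expected = len(all_search_engines) * 2  # num_pages_for_keyword == 2 in the config above
cached = [name for name in os.listdir(cachedir)
          if os.path.isfile(os.path.join(cachedir, name))]
assert len(cached) == expected, 'expected {} cached pages, found {}'.format(expected, len(cached))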
Example #4
    def test_json_output_static(self):
        """Test json output.

        """

        import json

        number_search_engines = len(all_search_engines)
        json_outfile = os.path.join(base, 'data/tmp/json_test.json')

        config = {
            'keyword': 'some words',
            'search_engines': all_search_engines,
            'num_pages_for_keyword': 2,
            'scrape_method': 'selenium',
            'cachedir': os.path.join(base, 'data/json_tests/'),
            'do_caching': True,
            'verbosity': 0,
            'output_filename': json_outfile
        }
        scrape_with_config(config)

        assert os.path.exists(json_outfile), '{} does not exist'.format(
            json_outfile)

        file = open(json_outfile, 'r')
        try:
            results = json.load(file)
        except ValueError as e:
            print('Cannot parse output json file {}. Reason: {}'.format(
                json_outfile, e))
            raise e

        # the items that should always have a value:
        notnull = ('link', 'rank', 'domain', 'title', 'link_type')
        num_results = 0
        for item in results:

            for k, v in item.items():

                if k == 'results':

                    for res in v:
                        num_results += 1

                        for item in notnull:
                            assert res[item], \
                                '{} has an item that has no value: {}'.format(item, res)

        self.assertAlmostEqual(number_search_engines * 2 * 10,
                               num_results,
                               delta=30)
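The loop above assumes the JSON file is a list of per-query objects, each carrying a 'results' list; a minimal sketch of that assumed shape (field values are illustrative only):

assumed_layout = [
    {
        'query': 'some words',
        'search_engine_name': 'google',
        'page_number': 1,
        'results': [
            {'rank': 1, 'link': 'https://example.com', 'domain': 'example.com',
             'title': 'Example result', 'link_type': 'results'},
        ],
    },
]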
def insertNametoConfig_Search(name, config):
    #config['keyword'] = name + ' "Saas" rds.fightmetric.com'
    config['keyword'] = name + ' site:www.fightmetric.com'
    config['output_filename'] = name + ".json"
    print("PRINTING WITH PHRASE: " + config['keyword'])
    data = None
    try:
        search = scrape_with_config(config)
        data = readinJsonSearch(name)
        print(str(data))
        url_json = {}
        for res in data[0]['results']:
            # matches the last name only, but the search uses the full name,
            # so this should be correct ~99.9% of the time
            g = re.search(r'(.+)\s(.+)', name)
            n1 = str(g.group(2))
            if n1 in res['title']:
                url_json = {'name': name, 'url': str(res['link'])}
                os.remove(name + '.json')
                with open(name + '.json', 'w') as outfile:
                    json.dump(url_json, outfile)
                data = readinJsonSearch(name)

            # note: there is no fightmetric result when "num_results" is "0", e.g.:
            #   "query": "Khalid Murtazaliev  site:www.fightmetric.com",
            #   "requested_at": "2018-09-12 01:21:22.325246",

    except GoogleSearchError as e:
        print(e)
    try:
        if data["num_results"] != "0":
            print("returning none")
            return None
    except (KeyError, TypeError):
        # data is not a dict with "num_results" (e.g. the scrape failed or it
        # is the raw list from readinJsonSearch), so hand it back as-is.
        return data
    def test_no_results_serp_object(self):

        config = {
            'SCRAPING': {
                'keyword': 'asdfasdfa7654567654345654343sdfasd',
                'search_engines': '*', # all available search engines
                'num_pages_for_keyword': 1,
                'scrape_method': 'selenium'
            },
            'GLOBAL': {
                'cachedir': 'data/no_results/',
                'do_caching': 'True',
                'verbosity': 1
            }
        }
        search = scrape_with_config(config)

        assert search.number_search_engines_used == len(all_search_engines)
        assert len(search.used_search_engines.split(',')) == len(all_search_engines)
        assert search.number_proxies_used == 1
        assert search.number_search_queries == 1
        assert search.started_searching < search.stopped_searching

        assert len(all_search_engines) == len(search.serps), 'Not enough results. Expected: {}, got {}'.format(len(all_search_engines), len(search.serps))

        for serp in search.serps:
            assert serp.has_no_results_for_query(), 'num_results must be 0 but is {}. {}'.format(serp.num_results, serp.links)

            # some search engines do alternative searches instead of yielding
            # nothing at all.

            if serp.search_engine_name in ('google', 'bing'):
                assert serp.effective_query, '{} must have an effective query when a keyword has no results.'.format(serp.search_engine_name)
Example #7
def basic_usage():
    # See in the config.cfg file for possible values
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keyword': 'Let\'s go bubbles!',
            'search_engines': 'yandex',
            'num_pages_for_keyword': 1
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
        },
        'GLOBAL': {
            'do_caching': 'False'
        }
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    # let's inspect what we got

    for search in sqlalchemy_session.query(ScraperSearch).all():
        for serp in search.serps:
            print(serp)
            for link in serp.links:
                print(link)
Example #8
def fetchImages(query):
    config = {
        'keyword': query,
        'search_engines': ['yandex'],
        'search_type': 'image',
        'scrape_method': 'selenium',
        'do_caching': True,
        'log_level': 'CRITICAL',
        'print_results': 'summary',
        'output_format': ''
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    image_urls = []

    for serp in search.serps:
        image_urls.extend(
            [link.link for link in serp.links]
        )
    max_num_of_images = 2
    images = []
    i = 0
    for image_url in image_urls:
        images.append(unquote(image_url))
        i += 1
        if i >= max_num_of_images:
            break
    return images
    def test_no_results_serp_object(self):

        config = {
            "keyword": "asdfasdfa7654567654345654343sdfasd",
            "search_engines": all_search_engines,
            "num_pages_for_keyword": 1,
            "scrape_method": "selenium",
            "cachedir": os.path.join(base, "data/no_results/"),
            "do_caching": True,
            "verbosity": 1,
        }
        search = scrape_with_config(config)

        assert search.number_search_engines_used == len(all_search_engines)
        assert len(search.used_search_engines.split(",")) == len(all_search_engines)
        assert search.number_proxies_used == 1
        assert search.number_search_queries == 1
        assert search.started_searching < search.stopped_searching

        assert len(all_search_engines) == len(search.serps), "Not enough results. Expected: {}, got {}".format(
            len(all_search_engines), len(search.serps)
        )

        for serp in search.serps:
            assert serp.has_no_results_for_query(), "num_results must be 0 but is {}. {}".format(
                serp.num_results, serp.links
            )

            # some search engines do alternative searches instead of yielding
            # nothing at all.

            if serp.search_engine_name in ("google", "bing"):
                assert serp.effective_query, "{} must have an effective query when a keyword has no results.".format(
                    serp.search_engine_name
                )
Example #10
def basic_usage():
    # See in the config.cfg file for possible values
    generate_sub_queries(300)

    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keyword_file': "queries.txt",
            'search_engines': 'bing,baidu,yandex',
            'num_workers': 8,
            'num_pages_for_keyword': 20,
            'scrape_method': 'http',
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
            'num_workers': 4,
        },
        'GLOBAL': {
            'do_caching': 'True'
        },
        'OUTPUT': {
            'output_filename': 'out.txt',
        }
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
Example #11
def get_image_search_result(query):
    print('Searching image [{}]'.format(query))
    global config
    config['keyword'] = query
    config['search_type'] = 'image'
    config['search_engines'] = 'yandex'

    try:
        search = scrape_with_config(config)
        serp = search.serps[0]
        top_result = random.choice(serp.links)
        url = top_result.link
        url = urllib.parse.unquote(url)
        content = requests.get(url).content
    except Exception as e:
        print(e)
        url = ''
        content = 'Nothing found'
    try:
        with open('tmp.jpg', 'wb') as f:
            f.write(content)
        info = convert_image('tmp.jpg', 'tmp.jpg')
    except:
        info = ''
    #return 'tmp.jpg'
    return '{} {}'.format(url, info)  # for qq temporarily
Example #12
        def analizar_keyword(keyword):
            # GoogleScraper configuration
            pdb.set_trace()
            config = {
                'SCRAPING': {
                    'use_own_ip': 'True',
                    'keyword': keyword,
                    'search_engines': 'bing',
                    'num_pages_for_keyword': 3
                },
                'SELENIUM': {
                    'sel_browser': 'chrome',
                },
                'GLOBAL': {
                    'do_caching': 'False'
                }
            }

            try:
                pdb.set_trace()
                sqlalchemy_session = scrape_with_config(config)
            except GoogleSearchError as e:
                print(e)

            # Inspection
            pdb.set_trace()
            for search in sqlalchemy_session.query(ScraperSearch).all():
                for serp in search.serps:
                    print(serp)
                    for link in serp.links:
                        print(link)
Example #13
def getTopResultsFromGoogle(word):
    keywords = []
    keywords.append(word)

    config = {
        'use_own_ip': 'True',
        'keywords': keywords,
        'search_engines': [
            'google',
        ],
        'num_pages_for_keyword': 2,
        'scrape_method': 'http',
        'do_caching': 'False'
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    results = []
    count = 0

    if search.serps[0].page_number == 2:
        search.serps.reverse()
    for serp in search.serps:
        print(serp)
        for link in serp.links:
            if (count == 10):
                break
            results.append(link.link)
            count += 1

    return results
Example #14
    def test_no_results_serp_object(self):

        config = {
            'keyword': 'asdfasdfa7654567654345654343sdfasd',
            'search_engines': all_search_engines,
            'num_pages_for_keyword': 1,
            'scrape_method': 'selenium',
            'cachedir': os.path.join(base, 'data/no_results/'),
            'do_caching': True,
            'verbosity': 1,
        }
        search = scrape_with_config(config)

        assert search.number_search_engines_used == len(all_search_engines)
        assert len(search.used_search_engines.split(',')) == len(all_search_engines)
        assert search.number_proxies_used == 1
        assert search.number_search_queries == 1
        assert search.started_searching < search.stopped_searching

        assert len(all_search_engines) == len(
            search.serps), 'Not enough results. Expected: {}, got {}'.format(
                len(all_search_engines), len(search.serps))

        for serp in search.serps:
            assert serp.has_no_results_for_query(
            ), 'num_results must be 0 but is {}. {}'.format(
                serp.num_results, serp.links)

            # some search engines do alternative searches instead of yielding
            # nothing at all.

            if serp.search_engine_name in ('google', 'bing'):
                assert serp.effective_query, '{} must have an effective query when a keyword has no results.'.format(
                    serp.search_engine_name)
    def test_csv_output_static(self):
        """Test csv output.

        Test parsing 4 html pages with two queries and two pages per query and
        transforming the results to csv format.

        The cached files should be saved in 'data/csv_tests/'; there should
        be as many files as search_engines * pages_for_keyword.

        The keyword used in the static SERP pages MUST be 'some words'.

        The filenames must be in the GoogleScraper cache format.
        """

        import csv
        from GoogleScraper.output_converter import csv_fieldnames

        number_search_engines = len(all_search_engines)
        csv_outfile = os.path.join(base, "data/tmp/csv_test.csv")

        config = {
            "keyword": "some words",
            "search_engines": all_search_engines,
            "num_pages_for_keyword": 2,
            "scrape_method": "selenium",
            "cachedir": os.path.join(base, "data/csv_tests/"),
            "do_caching": True,
            "verbosity": 0,
            "output_filename": csv_outfile,
        }
        search = scrape_with_config(config)

        assert os.path.exists(csv_outfile), "{} does not exist".format(csv_outfile)

        reader = csv.reader(open(csv_outfile, "rt"))

        # the items that should always have a value:
        notnull = (
            "link",
            "query",
            "rank",
            "domain",
            "title",
            "link_type",
            "scrape_method",
            "page_number",
            "search_engine_name",
            "snippet",
        )

        for rownum, row in enumerate(reader):
            if rownum == 0:
                header = row
                header_keys = set(row)
                assert header_keys.issubset(set(csv_fieldnames)), "Invalid CSV header: {}".format(header)

            for item in notnull:
                assert row[header.index(item)], "{} has an item that has no value: {}".format(item, row)

        self.assertAlmostEqual(number_search_engines * 2 * 10, rownum, delta=30)
Example #18
def saveLink(query):
    # See in the config.cfg file for possible values
    try:
        if query:
            file_name = query.replace(" ", "_")
            config = {
                'SCRAPING': {
                    'use_own_ip': 'True',
                    'keyword': query,
                    'search_engines': 'bing',
                    'num_pages_for_keyword': 1,
                    'scrape_method': 'http'
                },
                'SELENIUM': {
                    'sel_browser': 'chrome',
                },
                'OUTPUT': {
                    'output_filename': "path/" + file_name + ".json"
                },
                'GLOBAL': {
                    'do_caching': 'False'
                }
            }

            raw_html = ""
            sqlalchemy_session = scrape_with_config(config)
    except Exception:
        import traceback
        print(traceback.format_exc())
Example #19
    def ScrapLinksFromBrowser(self):
        # See in the config.cfg file for possible values
        global config
        config = {
            'use_own_ip': True,
            'keyword': 'security brigade',
            'search_engines': ['Google', 'Bing', 'Yahoo', 'Yandex', 'Baidu', 'Duckduckgo'],
            'num_pages_for_keyword': 2,
            'scrape_method': 'selenium',
            'sel_browser': 'chrome',
        }
        try:
            search = scrape_with_config(config)
        except GoogleSearchError as e:
            print(e)

        # let's inspect what we got
        for serp in search.serps:
            # print(serp)
            # print(serp.search_engine_name)
            # print(serp.scrape_method)
            # print(serp.page_number)
            # print(serp.requested_at)
            # print(serp.num_results)
            # ... more attributes ...
            for link in serp.links:
                self.listoflink.append(link)
Example #20
File: project1.py  Project: tudorgk/WS
def basic_usage():
    # See in the config.cfg file for possible values
    generate_sub_queries(300)

    json_outfile = 'data/tmp/json_test.json'

    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keyword_file': "queries.txt",
            'search_engines': 'bing,baidu',
            'num_pages_for_keyword': 10,
            'scrape_method': 'http',
            'num_workers': 8,
        },
        'GLOBAL': {
            'cachedir': 'data/json_tests/',
            'do_caching': 'True',
            'verbosity': 0
        },
        'OUTPUT': {
            'output_filename': json_outfile
        }
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
Example #21
def basic_search(query, engines, pages):
    # See in the config.cfg file for possible values
    config = {
        'use_own_ip': True,
        'keyword': query,
        'search_engines': [engines],
        'num_pages_for_keyword': pages,
        'scrape_method': 'http',
        'loglevel': 'WARN',
        'print_results': 'summarize',
        'do_caching': False
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    # let's inspect what we got
    '''
    for serp in search.serps:
        #print(serp)
        print(serp.status)
        print(serp.scrape_method)
        print(serp.page_number)
        print(serp.requested_at)
        print(serp.num_results)
        for link in serp.links:
            print(link)
    '''
    return search
Example #22
    def image_search(key_phrase,
                     threads_count,
                     pages_count,
                     target_directory,
                     search_engines=None):
        if not search_engines:
            search_engines = ['google', 'baidu', 'yandex', 'bing', 'yahoo']

        config = {
            'keywords': [key_phrase],
            'search_engines': search_engines,
            'search_type': 'image',
            'scrape_method': 'selenium',
            'do_caching': False,
            'num_pages_for_keyword': str(pages_count)
        }

        try:
            search = scrape_with_config(config)
        except GoogleSearchError as e:
            print(e)

        image_urls = []
        print("\t\t\t --- ", len(image_urls))

        for serp in search.serps:
            image_urls.extend([link.link for link in serp.links])

        print('[i] Going to scrape {num} images and saving them in "{dir}"'.
              format(num=len(image_urls), dir=target_directory))

        try:
            os.mkdir(target_directory)
        except FileExistsError:
            pass

        # fire up threads_count threads to get the images

        threads = [
            FetchResource(target_directory, []) for i in range(threads_count)
        ]

        while image_urls:
            for t in threads:
                try:
                    t.urls.append(image_urls.pop())
                except IndexError as e:
                    break

        threads = [t for t in threads if t.urls]

        for t in threads:
            t.start()

        for t in threads:
            t.join()

        return True
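image_search above relies on a FetchResource thread class defined elsewhere in its module; a minimal sketch of such a class (an approximation modelled on the definition shown in a later example, not the project's exact implementation):

import os
import threading

import requests


class FetchResource(threading.Thread):
    """Downloads every URL in self.urls into the target directory."""

    def __init__(self, target, urls):
        super().__init__()
        self.target = target
        self.urls = urls

    def run(self):
        for url in self.urls:
            filename = url.split('/')[-1] or 'index'
            try:
                content = requests.get(url, timeout=30).content
            except requests.RequestException:
                continue
            with open(os.path.join(self.target, filename), 'wb') as f:
                f.write(content)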
Example #23
def crawl_data(keyword):
    file_num = 0
    output_filename = './crawling_output/output_{}.csv'.format(file_num)
    params = {
        'keyword': keyword + ' site:www.quora.com',
        'num_pages': 2,
        'filename': output_filename,
    }

    config = get_config(**params)
    title_list = []
    title_origin_list = []
    similarity_list = []
    link_list = []
    dict_idx = 0
    output_dict = {}

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
    else:
        # function that checks the search results
        # test_google_search(search)

        # open csv file
        with open(output_filename, 'r', newline='') as csv_file:
            # csv_reader = csv.reader(csv_file, delimiter=',')
            csv_reader = csv.DictReader(csv_file, delimiter=',')

            for row in csv_reader:
                title_origin = row['title']
                title = row['title']
                link = row['link']

                # strip the subtitle from the title:
                # cut at '-' or '|' if present, as in 'title - src site'
                title = preprocess_title(title)

                # build a dictionary element and append it
                dict_element = {
                    'title': title,
                    'title_origin': title_origin,
                    'similarity': 0.0,
                    'link': link,
                }
                output_dict[dict_idx] = dict_element

                title_list.append(title)
                title_origin_list.append(title_origin)
                link_list.append(row['link'])

                dict_idx += 1

                # if there is no separator, the title is kept as-is
            csv_file.close()

    return title_list, link_list
Example #24
def image_search():
    # See in the config.cfg file for possible values
    config = {
        'SCRAPING': {
            'keyword': 'snow nature',
            'search_engines': 'yandex,google,bing,duckduckgo,yahoo,baidu',
            'search_type': 'image',
            'scrapemethod': 'http'
        }
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    image_urls = []
    search = sqlalchemy_session.query(ScraperSearch).all()[-1]

    for serp in search.serps:
        image_urls.extend(
            [link.link for link in serp.links]
        )

    import threading, requests, os

    class FetchResource(threading.Thread):
        """Grabs a web resource and stores it in the target directory"""
        def __init__(self, target, urls):
            super().__init__()
            self.target = target
            self.urls = urls

        def run(self):
            for url in self.urls:
                with open(os.path.join(self.target, url.split('/')[-1]), 'wb') as f:
                    f.write(requests.get(url).content)


    # make a directory for the results
    os.mkdir('images')

    # fire up 100 threads to get the images
    num_threads = 100

    threads = [FetchResource('images/', []) for i in range(num_threads)]

    while image_urls:
        for t in threads:
            t.urls.append(image_urls.pop())

    threads = [t for t in threads if t.urls]

    for t in threads:
        t.start()

    for t in threads:
        t.join()
Example #25
def fetchBlogUrls(city_a, city_b):
    print('\n\nSleeping for {} sec...'.format(sleepTime))
    time.sleep(sleepTime)
    query = 'places to visit between ' + city_a + ' and ' + city_b + ' blogs'
    print('fetching : ' + query + '...')

    config = {
        'use_own_ip': True,
        'keyword': query,
        'search_engines': ['bing'],
        'num_pages_for_keyword': 2,
        'scrape_method': 'selenium',
        'sel_browser': 'chrome',
        'do_caching': False,
        'log_level': 'CRITICAL',
        'print_results': 'summary',
        'output_format': ''
    }
    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    urls = []
    #pprint(search)
    for serp in search.serps:
        for link in serp.links:
            print(link.getLink())
            urls.append(link.getLink())

    #pauseInterval = random.uniform(1, 20)
    #print(pauseInterval)
    #urls = search(query, stop=20, pause=pauseInterval)#last result to retrieve
    #return urls
    #sources = [

#"http://kskrishnan.blogspot.com/2010/09/bangalore-to-mysore.html",
#"https://www.makemytrip.com/blog/mysore-tales-1-making-way-to-mysores-hotspots",
#"http://rajivc-food-travel-life.blogspot.com/2015/05/trip-to-gods-own-country-bangalore-to.html"
#]
#ca_sources = [
#"https://www.tripline.net/trip/San_Francisco_to_San_Diego_on_the_PCH-7521703244561003A0278A25A729E901",
#"https://www.gapyear.com/articles/216212/13-incredible-stops-on-the-pacific-coast-highway",
#"http://moon.com/2015/08/road-trip-itinerary-san-diego-to-san-francisco-in-two-days/",
#"http://moon.com/2016/05/take-a-two-week-california-coast-road-trip/",
#"http://californiathroughmylens.com/pacific-coast-highway-stops",
#"http://californiathroughmylens.com/san-francisco-mendocino-guide",
#"http://www.heleninwonderlust.co.uk/2014/03/ultimate-california-road-trip-itinerary-las-vegas/",
#"http://www.worldofwanderlust.com/where-to-stop-on-the-pacific-coast-highway/",
#"http://www.visitcalifornia.com/trip/highway-one-classic",
#"http://independenttravelcats.com/2015/11/24/planning-a-california-pacific-coast-highway-road-trip-from-san-francisco-to-los-angeles/"
#]
#ca_sources1 = [
#"https://www.tripline.net/trip/San_Francisco_to_San_Diego_on_the_PCH-7521703244561003A0278A25A729E901",
#"https://www.gapyear.com/articles/216212/13-incredible-stops-on-the-pacific-coast-highway"]
#print(urls)
    return urls
Example #26
def basic_usage(products_parsed):
    local_anti = 0
    # See in the config.cfg file for possible values
    keywords = [y for x, y in products_parsed]
    config = {
        'use_own_ip': 'True',
        'search_engines': [
            'bing',
        ],
        'num_pages_for_keyword': 1,
        'num_results_per_page': 20,
        'num_workers': step,
        'keywords': keywords,
        'SELENIUM': {
            'sel_browser': 'chrome',
        },
        'do_caching': 'True'
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    # let's inspect what we got
    serps = sqlalchemy_session.serps
    loop = dict()
    for it, serp in enumerate(serps):
        loop[serp.query] = list()
        for link in serp.links:
            loop[serp.query].append({'link': link.link, 'title': link.title})

    for it in products_parsed:
        links = loop.get(it[1], None)
        if not links:
            local_anti += 1
            continue

        for link in links:

            if 'product' in link['link'] and 'instacart' in link['link']:
                req = requests.get(url=link['link'])
                if req.status_code != 404:
                    product_list[it[0]]['link'] = link['link']
                    product_list[it[0]]['title'] = link['title']
                    product_list[it[0]]['content'] = req.content
                    break
                else:
                    product_list[it[0]]['link'] = link['link']
                    product_list[it[0]]['title'] = link['title']
                    product_list[it[0]]['content'] = None

        if not product_list[it[0]].get('link', False):
            local_anti += 1

    return local_anti
Example #27
def image_search():
    # See in the config.cfg file for possible values
    config = {
        'SCRAPING': {
            'keyword': 'snow nature',
            'search_engines': 'yandex,google,bing,duckduckgo,yahoo,baidu',
            'search_type': 'image',
            'scrapemethod': 'http'
        }
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    image_urls = []
    search = sqlalchemy_session.query(ScraperSearch).all()[-1]

    for serp in search.serps:
        image_urls.extend([link.link for link in serp.links])

    import threading, requests, os

    class FetchResource(threading.Thread):
        """Grabs a web resource and stores it in the target directory"""
        def __init__(self, target, urls):
            super().__init__()
            self.target = target
            self.urls = urls

        def run(self):
            for url in self.urls:
                with open(os.path.join(self.target,
                                       url.split('/')[-1]), 'wb') as f:
                    f.write(requests.get(url).content)

    # make a directory for the results
    os.mkdir('images')

    # fire up 100 threads to get the images
    num_threads = 100

    threads = [FetchResource('images/', []) for i in range(num_threads)]

    while image_urls:
        for t in threads:
            t.urls.append(image_urls.pop())

    threads = [t for t in threads if t.urls]

    for t in threads:
        t.start()

    for t in threads:
        t.join()
Example #28
    def test_csv_output_static(self):
        """Test csv output.

        Test parsing 4 html pages with two queries and two pages per query and
        transforming the results to csv format.

        The cached files should be saved in 'data/csv_tests/'; there should
        be as many files as search_engines * pages_for_keyword.

        The keyword used in the static SERP pages MUST be 'some words'.

        The filenames must be in the GoogleScraper cache format.
        """

        import csv
        from GoogleScraper.output_converter import csv_fieldnames

        number_search_engines = len(all_search_engines)
        csv_outfile = 'data/tmp/csv_test.csv'

        config = {
            'SCRAPING': {
                'keyword': 'some words',
                'search_engines': ','.join(all_search_engines),
                'num_pages_for_keyword': 2,
                'scrape_method': 'selenium'
            },
            'GLOBAL': {
                'cachedir': 'data/csv_tests/',
                'do_caching': 'True',
                'verbosity': 0
            },
            'OUTPUT': {
                'output_filename': csv_outfile
            }
        }
        search = scrape_with_config(config)

        assert os.path.exists(csv_outfile), '{} does not exist'.format(csv_outfile)

        reader = csv.reader(open(csv_outfile, 'rt'))

        # the items that should always have a value:
        notnull = ('link', 'query', 'rank', 'domain', 'title', 'link_type', 'scrape_method', 'page_number', 'search_engine_name', 'snippet')

        for rownum, row in enumerate(reader):
            if rownum == 0:
                header = row
                header_keys = set(row)
                assert header_keys.issubset(set(csv_fieldnames)), 'Invalid CSV header: {}'.format(header)

            for item in notnull:
                assert row[header.index(item)], '{} has an item that has no value: {}'.format(item, row)

        self.assertAlmostEqual(number_search_engines * 2 * 10, rownum, delta=30)
Example #29
def extract_urls(keywords_file, companies_list, proxy_list=None):
    '''
    Use GoogleScraper to extract URLs based on the combination of company name
    and keywords. The results are stored in a .json file, and the path and
    name of that file are returned.

    Input: the paths of the keywords file and the company list. These should be
    txt files or files without an extension.

    Output: a json file and a sqlite db file holding all the query results are
    created automatically; the path of the json file is returned.
    '''
    if proxy_list:
        proxy_file = proxy_list
    else:
        full_path = os.path.realpath(__file__)
        path, filename = os.path.split(full_path)
        # print(path, filename)
        proxy_file = os.path.join(path, "ProxyProvider", "proxy.txt")

    query = create_query(keywords_file, companies_list)
    if len(query) == 0:
        print("All queries been scraped")
        return companies_list + '.json'
    config = {
        'use_own_ip': False,
        'keywords': query,
        'check_proxies': False,
        'search_engines': 'google',
        'stop_on_detection': False,
        # 'google_sleeping_ranges': 5,
        'num_pages_for_keyword': 1,
        'scrape_method': 'selenium',  # http or selenium
        'sel_browser': 'Phantomjs',
        'num_workers': 1,
        'verbosity': 2,
        'do_caching': False,
        # 'sleeping_ranges': '5: 5, 10',
        'google_search_url': 'http://www.google.com/search?',
        'proxy_file': proxy_file,
        'output_filename': companies_list + '.json',
        'database_name': companies_list,
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    if companies_list.count('.') > 0:
        return companies_list[:companies_list.rindex('.')] + '.json'
    else:
        return companies_list + '.json'
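A hypothetical usage sketch for extract_urls; the file names below are illustrative assumptions, not files shipped with the project:

# keywords.txt and companies.txt are assumed to exist, one entry per line.
result_file = extract_urls('keywords.txt', 'companies.txt')
print('query results were written to', result_file)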
Example #30
    def test_json_output_static(self):
        """Test json output.

        """

        import json

        number_search_engines = len(all_search_engines)
        json_outfile = 'data/tmp/json_test.json'

        config = {
            'SCRAPING': {
                'keyword': 'some words',
                'search_engines': ','.join(all_search_engines),
                'num_pages_for_keyword': 2,
                'scrape_method': 'selenium'
            },
            'GLOBAL': {
                'cachedir': 'data/json_tests/',
                'do_caching': 'True',
                'verbosity': 0
            },
            'OUTPUT': {
                'output_filename': json_outfile
            }
        }
        search = scrape_with_config(config)

        assert os.path.exists(json_outfile), '{} does not exist'.format(json_outfile)

        file = open(json_outfile, 'r')
        try:
            results = json.load(file)
        except ValueError as e:
            print('Cannot parse output json file {}. Reason: {}'.format(json_outfile, e))
            raise e

        # the items that should always have a value:
        notnull = ('link', 'rank', 'domain', 'title', 'link_type')
        num_results = 0
        for item in results:

            for k, v in item.items():

                if k == 'results':

                    for res in v:
                        num_results += 1

                        for item in notnull:
                            assert res[item], '{} has an item that has no value: {}'.format(item, res)

        self.assertAlmostEqual(number_search_engines * 2 * 10, num_results, delta=30)
Example #31
def run(self):
    # the multiplier collects far more results than required for establishing suggestions
    n = self.param['n'] * 10
    MAX_PER_PAGE = 100  # this is a limit imposed by GoogleScraper
    config = {
        'use_own_ip': True,
        'keyword': self.param['keywords'],
        'search_engines': ['bing'],
        'scrape_method': 'http',
        'do_caching': False,
        'log_level': self.verbose,
        'num_pages_for_keyword': len(range(0, n, MAX_PER_PAGE)) if n > 0 else 1,
        'num_results_per_page': min(n, MAX_PER_PAGE) if n > 0 else MAX_PER_PAGE,
    }
    if 'PROXY_FILE' in self.config and self.config['PROXY_FILE'] not in [None, '']:
        config.update({
            'proxy_file': self.config['PROXY_FILE'],
            'check_proxies': False
        })
        # NB: check_proxies is a parameter aimed to make the (public) proxy address checked on a website,
        #     so if using a private network proxy, this check is not required
    # scrape on keywords and get a connection to the cache database
    search = scrape_with_config(config)
    # check the status and raise an exception if scraping failed
    for serp in search.serps:
        if serp.status != 'successful' and serp.no_results:
            self.logger.error(serp.status)
            exit(1)
    # collect found links
    links = [link for serp in search.serps for link in serp.links]
    suggestions = []
    k, l = 0, len(links)
    while len(suggestions) < self.param['n'] and k < l:
        # TODO: write a filter
        #  e.g. for:
        #  - favouring links whose domain contains one or more of the keywords
        #  - excluding links on specific forums and/or download sites
        #  - excluding malicious domains according to Norton Safe Web or other security sources (e.g. Webputation)
        suggestions.append({
            'link': links[k].link,
            'title': links[k].title,
            'text': links[k].snippet
        })
        k += 1
    return suggestions
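One way to approximate the filter described in the TODO above; the domain heuristics and blacklist are illustrative assumptions, not the module's implementation:

def keep_link(link, keywords, blacklist=('forum', 'download', 'torrent')):
    """Favour links whose domain mentions a keyword and drop blacklisted domains."""
    # crude domain extraction: 'https://example.com/path' -> 'example.com'
    domain = link.link.split('/')[2].lower() if '://' in link.link else link.link.lower()
    if any(bad in domain for bad in blacklist):
        return False
    return any(word.lower() in domain for word in keywords.split())

# usage inside run(), assuming self.param['keywords'] is a whitespace-separated string:
# links = [link for link in links if keep_link(link, self.param['keywords'])]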
    def test_all_search_engines_in_selenium_mode(self):
        """
        Very simple test case that assures that scraping all
        search engines in selenium mode works.

        Basically copy paste from `test_all_search_engines_in_http_mode`.
        """

        config = {
            'keyword': 'dont look back in anger',
            'search_engines': '*',
            'scrape_method': 'selenium',
            'sel_browser': 'chrome',
            'browser_mode': 'headless',
            'chromedriver_path': '/home/nikolai/projects/private/Drivers/chromedriver',
            'do_caching': False,
            'num_results_per_page': 10,
        }

        search = scrape_with_config(config)

        self.assertLess(search.started_searching, search.stopped_searching)
        self.assertEqual(search.number_proxies_used, 1)
        self.assertEqual(search.number_search_engines_used,
                         len(all_search_engines))
        self.assertEqual(search.number_search_queries, 1)
        self.assertEqual(len(search.serps), len(all_search_engines))

        for i, serp in enumerate(search.serps):
            self.assertEqual(search.serps[i].page_number, 1)
            self.assertEqual(serp.status, 'successful')
            self.assertIn(serp.search_engine_name.lower(), all_search_engines)
            self.assertEqual(serp.scrape_method, 'selenium')
            self.assertTrue(serp.num_results_for_query)
            self.assertAlmostEqual(serp.num_results, 10, delta=2)
            self.assertFalse(is_string_and_longer_than(serp.effective_query,
                                                       1),
                             msg=serp.effective_query)
            self.assertEqual(serp.no_results, False)
            self.assertEqual(serp.num_results, len(serp.links))

            for j, link in enumerate(serp.links):
                if link.link_type == 'results':
                    self.assertTrue(is_string_and_longer_than(link.title, 3))
                    self.assertTrue(is_string_and_longer_than(link.snippet, 3))

                self.assertTrue(is_string_and_longer_than(link.link, 10))
                self.assertTrue(link.domain in link.link)
                self.assertTrue(isinstance(link.rank, int))
    def test_csv_file_header_always_the_same(self):
        """
        Check that csv files always have the same column order in their header.
        """
        csv_outfile_1 = os.path.join(base, "data/tmp/csvout1.csv")
        csv_outfile_2 = os.path.join(base, "data/tmp/csvout2.csv")

        config = {
            "keyword": "some words",
            "search_engines": all_search_engines,
            "num_pages_for_keyword": 2,
            "scrape_method": "selenium",
            "cachedir": os.path.join(base, "data/csv_tests/"),
            "do_caching": True,
            "verbosity": 0,
            "output_filename": csv_outfile_1,
        }
        search = scrape_with_config(config)

        search = scrape_with_config(config)
        config.update({"output_filename": csv_outfile_2})
        search = scrape_with_config(config)

        assert os.path.isfile(csv_outfile_1) and os.path.isfile(csv_outfile_2)

        file1 = open(csv_outfile_1, "rt")
        file2 = open(csv_outfile_2, "rt")

        import csv

        reader1, reader2 = csv.DictReader(file1), csv.DictReader(file2)

        header1, header2 = reader1.fieldnames, reader2.fieldnames
        from GoogleScraper.output_converter import csv_fieldnames

        assert header1 == header2 == csv_fieldnames
    def test_json_output_static(self):
        """Test json output.

        """

        import json

        number_search_engines = len(all_search_engines)
        json_outfile = os.path.join(base, "data/tmp/json_test.json")

        config = {
            "keyword": "some words",
            "search_engines": all_search_engines,
            "num_pages_for_keyword": 2,
            "scrape_method": "selenium",
            "cachedir": os.path.join(base, "data/json_tests/"),
            "do_caching": True,
            "verbosity": 0,
            "output_filename": json_outfile,
        }
        search = scrape_with_config(config)

        assert os.path.exists(json_outfile), "{} does not exist".format(json_outfile)

        file = open(json_outfile, "r")
        try:
            results = json.load(file)
        except ValueError as e:
            print("Cannot parse output json file {}. Reason: {}".format(json_outfile, e))
            raise e

        # the items that should always have a value:
        notnull = ("link", "rank", "domain", "title", "link_type")
        num_results = 0
        for item in results:

            for k, v in item.items():

                if k == "results":

                    for res in v:
                        num_results += 1

                        for item in notnull:
                            assert res[item], "{} has an item that has no value: {}".format(item, res)

        self.assertAlmostEqual(number_search_engines * 2 * 10, num_results, delta=30)
    def scrapeArt(self, artistName, albumName, filename):
        searchQuery = artistName + " " + albumName
        # Configure the scraper
        config = {
            "SCRAPING": {
                "keyword": searchQuery,
                "search_engines": "google",
                "search_type": "image",
                "scrape_method": "http",
                "num_results_per_page": 1,
                "num_pages_for_keyword": 1,
            },
            "GLOBAL": {"verbosity": "0", "do_caching": "False"},
        }
        # Run the search and scrape results
        try:
            search = scrape_with_config(config)
        except GoogleSearchError as e:
            print(e)

        # Save the first image search result
        image_url = search.serps[0].links[0].link

        # Parse image url into a usable format
        url = urllib.parse.unquote(image_url)
        # Write the image data to file

        target_directory = os.path.join(self.outputDir, artistName + "/")
        target_directory = os.path.join(target_directory, albumName + "/")

        fileExt = findFileExtension(url)

        print(self.albumName)

        if fileExt == url:
            print("File extension: ", fileExt, " is not supported.")
            print("Error occurred on Artist: ", self.artistName, " Album: ", self.albumName)
            return

        finalFilename = filename + fileExt

        with open(os.path.join(target_directory, finalFilename), "wb") as f:
            try:
                content = requests.get(url).content
                f.write(content)
            except Exception as e:
                pass
Example #36
    def scrape_query(self,
                     mode,
                     search_engines='*',
                     query='',
                     random_query=False,
                     sel_browser='Chrome'):

        if random_query:
            query = random_word()

        config = {
            'SCRAPING': {
                'use_own_ip': 'True',
                'keyword': query,
                'search_engines': search_engines,
                'num_pages_for_keyword': 1,
                'scrape_method': mode,
            },
            'GLOBAL': {
                'do_caching': 'False',
                'verbosity': 0
            },
            'SELENIUM': {
                'sel_browser': sel_browser
            }
        }
        search = scrape_with_config(config)

        if search_engines == '*':
            assert search.number_search_engines_used == len(all_search_engines)
        else:
            assert search.number_search_engines_used == len(
                search_engines.split(','))

        if search_engines == '*':
            assert len(search.used_search_engines.split(',')) == len(
                all_search_engines)
        else:
            assert len(search.used_search_engines.split(',')) == len(
                search_engines.split(','))

        assert search.number_proxies_used == 1
        assert search.number_search_queries == 1
        assert search.started_searching < search.stopped_searching

        return search
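A usage sketch for the scrape_query helper above, called from another test method of the same class; the engines and query are arbitrary examples:

# scrape two engines over HTTP with a fixed query
search = self.scrape_query('http', search_engines='bing,duckduckgo', query='some words')

# or scrape every engine in selenium mode with a random keyword
search = self.scrape_query('selenium', random_query=True)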
    def test_all_search_engines_in_http_mode(self):
        """
        Very simple test case that assures that scraping all
        search engines in http mode works.
        """

        config = {
            'keyword': 'in this world',
            'search_engines': '*',
            'scrape_method': 'http',
            'do_caching': False,
            'num_results_per_page': 10,
            'log_level': 'WARNING',
            'print_results': 'summarize',
        }

        search = scrape_with_config(config)

        self.assertLess(search.started_searching, search.stopped_searching)
        self.assertEqual(search.number_proxies_used, 1)
        self.assertEqual(search.number_search_engines_used,
                         len(all_search_engines))
        self.assertEqual(search.number_search_queries, 1)
        self.assertEqual(len(search.serps), len(all_search_engines))

        for i, serp in enumerate(search.serps):
            self.assertEqual(search.serps[i].page_number, 1)
            self.assertEqual(serp.status, 'successful')
            self.assertIn(serp.search_engine_name.lower(), all_search_engines)
            self.assertEqual(serp.scrape_method, 'http')
            self.assertTrue(serp.num_results_for_query)
            self.assertTrue(serp.num_results >= 7)
            self.assertFalse(is_string_and_longer_than(serp.effective_query,
                                                       1),
                             msg=serp.effective_query)
            self.assertEqual(serp.num_results, len(serp.links))

            for j, link in enumerate(serp.links):
                if link.link_type == 'results':
                    self.assertTrue(is_string_and_longer_than(link.title, 3))
                    # no snippet needed actually
                    # self.assertTrue(is_string_and_longer_than(link.snippet, 3))

                self.assertTrue(is_string_and_longer_than(link.link, 10))
                self.assertTrue(link.domain in link.link)
                self.assertTrue(isinstance(link.rank, int))
Example #38
    def slat_(self, config):
        try:
            if 'wiki' in config['search_engines']:
                get_links = ('wikipedia', 0, None, config['keyword'], None)
                wiki_get(get_links)

            elif 'info_wars' in config['search_engines']:
                get_links = ('info_wars', 0, None, config['keyword'], None)
                info_wars_get(get_links)

            elif 'scholar' in config['search_engines']:
                get_links = ('scholar', 0, None, config['keyword'], None)
                search_scholar(get_links)

            elif 'scholarpedia' in config['search_engines']:
                get_links = ('scholar', 0, None, config['keyword'], None)
                scholar_pedia_get(get_links)

            else:
                search = scrape_with_config(config)
                links = []
                for serp in search.serps:
                    print(serp)
                    links.extend([link.link for link in serp.links])

                # This code block jumps over gate two
                # The (possibly private, or hosted server as a gatekeeper).
                if len(links) > self.NUM_LINKS: links = links[0:self.NUM_LINKS]
                if len(links) > 0:
                    print(links)
                    buffer = None
                    se_ = config['search_engines']
                    category = config['keyword']
                    get_links = ((se_, index, link, category, buffer)
                                 for index, link in enumerate(links))
                    for gl in get_links:
                        process(gl)
                    # map over the function in parallel since it's 2018
                    #b = db.from_sequence(get_links,npartitions=8)
                    #_ = list(b.map(process).compute())
        except GoogleSearchError as e:
            print(e)
            return None
        print('done scraping')
    def fetch_image_urls(self, keyword, num_urls):
        self.gscraper_config['SCRAPING']['keyword'] = keyword
        self.gscraper_config['SCRAPING']['num_pages_for_keyword'] = \
            self.get_num_pages_from_num_urls(num_urls)
        try:
            search = scrape_with_config(self.gscraper_config)
        except GoogleSearchError as e:
            logging.info(e)
            search = ''
            return
        image_urls = list()
        for serp in search.serps:
            image_urls.extend([link.link for link in serp.links])
        if num_urls > len(image_urls):
            return image_urls
        else:
            return image_urls[:num_urls]
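fetch_image_urls calls a get_num_pages_from_num_urls helper that is not shown here; a minimal sketch of what it might look like, assuming a fixed number of image links per SERP page (the per-page constant is an assumption):

    def get_num_pages_from_num_urls(self, num_urls, results_per_page=100):
        # request enough pages to cover num_urls, always fetching at least one page
        return max(1, (num_urls + results_per_page - 1) // results_per_page)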
Example #40
    def test_all_search_engines_in_selenium_mode(self):
        """
        Very simple test case that assures that scraping all
        search engines in selenium mode works.

        Basically copy paste from `test_all_search_engines_in_http_mode`.
        """

        config = {
            'keyword': 'dont look back in anger',
            'search_engines': '*',
            'scrape_method': 'selenium',
            'sel_browser': 'chrome',
            'browser_mode': 'headless',
            'chromedriver_path': '/home/nikolai/projects/private/Drivers/chromedriver',
            'do_caching': False,
            'num_results_per_page': 10,
        }

        search = scrape_with_config(config)

        self.assertLess(search.started_searching, search.stopped_searching)
        self.assertEqual(search.number_proxies_used, 1)
        self.assertEqual(search.number_search_engines_used, len(all_search_engines))
        self.assertEqual(search.number_search_queries, 1)
        self.assertEqual(len(search.serps), len(all_search_engines))

        for i, serp in enumerate(search.serps):
            self.assertEqual(search.serps[i].page_number, 1)
            self.assertEqual(serp.status, 'successful')
            self.assertIn(serp.search_engine_name.lower(), all_search_engines)
            self.assertEqual(serp.scrape_method, 'selenium')
            self.assertTrue(serp.num_results_for_query)
            self.assertAlmostEqual(serp.num_results, 10, delta=2)
            self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query)
            self.assertEqual(serp.no_results, False)
            self.assertEqual(serp.num_results, len(serp.links))

            for j, link in enumerate(serp.links):
                if link.link_type == 'results':
                    self.assertTrue(is_string_and_longer_than(link.title, 3))
                    self.assertTrue(is_string_and_longer_than(link.snippet, 3))

                self.assertTrue(is_string_and_longer_than(link.link, 10))
                self.assertTrue(link.domain in link.link)
                self.assertTrue(isinstance(link.rank, int))
Example #41
def run_job(title, url):
    query = title

    config = {
        'use_own_ip': True,
        'keyword': query,
        'search_engines': ['Google'],
        # 'num_results_per_page': 10,  # this is ignored by bing, 10 results per page
        'num_pages_for_keyword': 100,
        'scrape_method': 'selenium',
        'num_workers': 4,
        # 'scrape_method': 'http',
        'sel_browser': 'chrome',
        # 'do_sleep': False,
        # 'browser_mode': 'normal',
        'browser_mode': 'headless',
        # 'chromedriver_path': '/Users/johnny/Downloads/chromedriver',
        'chromedriver_path': '/app/chromeDriver/chromedriver',
        'do_caching': False,
        # 'print_results': 'summarize',
        'google_search_url': url,
    }

    search = scrape_with_config(config)

    result = []

    print(search.serps)
    for serp in search.serps:
        for link in serp.links:
            if link.snippet and link.visible_link:
                title = link.snippet.replace("\n", "")
                link = link.visible_link

                if len(title) > 50:
                    title = f"{title[:30]}..."

                # make sure the link has a scheme before storing it
                if not link.startswith(('http://', 'https://')):
                    link = f'http://{link}'

                result.append({'title': title, 'link': link})

                print(title)
                print(link)
                print("-------")
    return result
Example #42
def main(arg_list):
    # Get the arguments
    path = arg_list[0]
    keyword = arg_list[1]

    # Create our target directory if it doesn't exist
    if not os.path.exists(path):
        os.mkdir(path)

    # Create our configuration file
    config = ConfigFactory.create_config(keyword)

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError:
        print('Error!')
        return

    image_urls = []
    search = sqlalchemy_session.query(ScraperSearch).all()[-1]

    for serp in search.serps:
        image_urls.extend([link.link for link in serp.links])

    print('[i] Going to scrape {num} images and saving them in "{dir}"'.format(
        num=len(image_urls),
        dir=path
    ))

    thread_count = 100
    threads = [FetchResource(path, []) for i in range(thread_count)]

    while image_urls:
        for thread in threads:
            try:
                thread.urls.append(image_urls.pop())
            except IndexError:
                break

    threads = [thread for thread in threads if thread.urls]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()
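# Neither ConfigFactory nor FetchResource is defined in this snippet (a FetchResource
# thread class very much like the one in Example #56 is assumed). A hypothetical,
# minimal ConfigFactory for an image search could look like this; every key except
# 'keyword' is an illustrative default, not taken from the original code:
class ConfigFactory:

    @staticmethod
    def create_config(keyword):
        return {
            'keyword': keyword,
            'search_engines': ['google'],
            'search_type': 'image',
            'num_pages_for_keyword': 1,
            'scrape_method': 'http',
            'do_caching': False,
        }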
Example #43
def search(data):
    global CONFIG

    CONFIG['keyword'] = data

    try:
        result = scrape_with_config(CONFIG)
    except (GoogleSearchError, SocketError) as _:
        return 'Not found'

    buf = []

    for serp in result.serps:
        for link in serp.links:
            if link.snippet is not None:
                buf.append(link.snippet.strip()+os.linesep)

    return os.linesep.join(buf)
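# CONFIG and SocketError come from the surrounding module and are not shown in this
# snippet. A minimal stand-in (all values below are assumptions) could be:
from socket import error as SocketError

CONFIG = {
    'use_own_ip': True,
    'keyword': '',
    'search_engines': ['google'],
    'num_pages_for_keyword': 1,
    'scrape_method': 'http',
    'do_caching': False,
}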
Example #44
def scraper():
    if os.path.exists("google_scraper.db"):
        os.remove("google_scraper.db")

    config = {
        'use_own_ip': True,
        'keyword_file': 'sent.txt',
        # 'bing_search_url' : 'http://www.bing.com/?mkt=zh-CN',
        'search_engines': ['google'],
        # 'search_engines': ['bing'],
        'num_pages_for_keyword': 1,
        'scrape_method': 'selenium',
        # 'scrape_method': 'http-async',
        #'sel_browser': 'firefox', # uncomment one when using selenium mode
        'sel_browser': 'chrome',
        'do_caching': False,
        'clean_cache_files': False,
        'print_results': 'summarize',
        # 'output_filename': 'out.csv', # added for async mode
        # 'google_sleeping_ranges' : { \
        #     1:  (2, 3), \
        #     5:  (3, 5), \
        #     30: (10, 20), \
        #     127: (30, 50), \
        #     }
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
        return []

    num_result = []
    for serp in search.serps:
        # print("-------------------------------------------------------")
        if not serp.effective_query:
            # if serp.no_results == False: # not work for bing
            num_result.append(serp.num_results_for_query)
        else:
            num_result.append("0")  # no result
#     print(serp.num_results_for_query)
#     print(serp.effective_query)
#     print(serp.no_results)

    return num_result
def getUrls(keyword):
    print(keyword)

    config = {
        'use_own_ip': 'False',
        'keyword': keyword + " site:en.wikipedia.org",
        'search_engines': ['bing', ],
        'num_pages_for_keyword': 1,
        'scrape_method': 'http',
        'do_caching': 'False',

    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
        return []

    return search.serps
Example #46
    def test_all_search_engines_in_http_mode(self):
        """
        Very simple test case that assures that scraping all
        search engines in http mode works.
        """

        config = {
            'keyword': 'in this world',
            'search_engines': '*',
            'scrape_method': 'http',
            'do_caching': False,
            'num_results_per_page': 10,
            'log_level': 'WARNING',
            'print_results': 'summarize',
        }

        search = scrape_with_config(config)

        self.assertLess(search.started_searching, search.stopped_searching)
        self.assertEqual(search.number_proxies_used, 1)
        self.assertEqual(search.number_search_engines_used, len(all_search_engines))
        self.assertEqual(search.number_search_queries, 1)
        self.assertEqual(len(search.serps), len(all_search_engines))

        for i, serp in enumerate(search.serps):
            self.assertEqual(search.serps[i].page_number, 1)
            self.assertEqual(serp.status, 'successful')
            self.assertIn(serp.search_engine_name.lower(), all_search_engines)
            self.assertEqual(serp.scrape_method, 'http')
            self.assertTrue(serp.num_results_for_query)
            self.assertTrue(serp.num_results >= 7)
            self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query)
            self.assertEqual(serp.num_results, len(serp.links))

            for j, link in enumerate(serp.links):
                if link.link_type == 'results':
                    self.assertTrue(is_string_and_longer_than(link.title, 3))
                    # no snippet needed actually
                    # self.assertTrue(is_string_and_longer_than(link.snippet, 3))

                self.assertTrue(is_string_and_longer_than(link.link, 10))
                self.assertTrue(link.domain in link.link)
                self.assertTrue(isinstance(link.rank, int))
Example #47
def related_search():
    target_directory = 'related/'

    # See in the config.cfg file for possible values
    config = {
            'keyword': 'web siling', # :D hehe have fun my dear friends
            'search_engines': 'yahoo', # duckduckgo not supported
            'search_type': 'related',
            'scrapemethod': 'selenium'
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print(">>>", e)
        return

    for search in sqlalchemy_session.query(ScraperSearch).all():
        for serp in search.serps:
            # print(serp, dir(serp))
            for keyword in serp.keywords:
                print(keyword)
Example #48
    def scrap(self, keyword):
        keywords = [keyword]
        # See in the config.cfg file for possible values
        config = {
            'use_own_ip': 'False',
            'keywords': keywords,
            'search_engines': ['google'],
            'num_pages_for_keyword': 2,
            'scrape_method': 'http',  # or 'selenium'
            # 'sel_browser': 'chrome', uncomment if scrape_method is selenium
            # 'executable_path': 'path\to\chromedriver' or 'path\to\phantomjs',
            'do_caching': 'True',
            'cachedir': '/tmp/.scrapecache/',
            'database_name': '/tmp/google_scraper',
            'clean_cache_after': 24,
            'output_filename': None,
            'print_results': 'all',
        }
        try:
            return scrape_with_config(config)
        except GoogleSearchError:
            traceback.print_exc()
Example #49
    def scrape_query(self, mode, search_engines='*', query='', random_query=False, sel_browser='Chrome'):

        if random_query:
            query = random_word()

        config = {
            'SCRAPING': {
                'use_own_ip': 'True',
                'keyword': query,
                'search_engines': search_engines,
                'num_pages_for_keyword': 1,
                'scrape_method': mode,
            },
            'GLOBAL': {
                'do_caching': 'False',
                'verbosity': 0
            },
            'SELENIUM': {
                'sel_browser': sel_browser
            }
        }
        search = scrape_with_config(config)

        if search_engines == '*':
            assert search.number_search_engines_used == len(all_search_engines)
        else:
            assert search.number_search_engines_used == len(search_engines.split(','))

        if search_engines == '*':
            assert len(search.used_search_engines.split(',')) == len(all_search_engines)
        else:
            assert len(search.used_search_engines.split(',')) == len(search_engines.split(','))

        assert search.number_proxies_used == 1
        assert search.number_search_queries == 1
        assert search.started_searching < search.stopped_searching

        return search
    def fetch_info(self, keyword):
        self.gscraper_config['SCRAPING']['keyword'] = keyword
        info = defaultdict(dict)
        info['num_results_for_query']['baidu'] = 0
        info['num_results_for_query']['google'] = 0
        for i in range(0, RETRY):
            try:
                search = scrape_with_config(self.gscraper_config)
            except GoogleSearchError as e:
                logging.info(e)
                return
            for serp in search.serps:
                text = serp.num_results_for_query
                if 'baidu' in serp.search_engine_name:
                    info['num_results_for_query']['baidu'] = int(cogtu_misc.get_first_number_from_text(text))
                elif 'google' in serp.search_engine_name:
                    info['num_results_for_query']['google'] = int(cogtu_misc.get_first_number_from_text(text))
            if info['num_results_for_query']['baidu'] != 0 or \
                    info['num_results_for_query']['google'] != 0:
                break
            logging.info('RETRYING...')
        return info
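# cogtu_misc.get_first_number_from_text() is external to this snippet. A minimal
# stand-in that pulls the first integer out of strings such as
# 'About 1,230,000 results' could look like this (name and behaviour are assumptions):
import re

def get_first_number_from_text(text):
    match = re.search(r'\d[\d,.\s]*', text or '')
    return int(re.sub(r'\D', '', match.group())) if match else 0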
Example #51
def run(self):
    n = self.param['n'] * 10  # multiplier is set to collect far more results than required for establishing suggestions
    MAX_PER_PAGE = 100        # this is a limit imposed by GoogleScraper
    config = {
        'use_own_ip': True,
        'keyword': self.param['keywords'],
        'search_engines': ['google'],
        'scrape_method': 'http',
        'do_caching': False,
        'log_level': self.verbose,
        'num_pages_for_keyword': len(range(0, n, MAX_PER_PAGE)) if n > 0 else 1,
        'num_results_per_page': min(n, MAX_PER_PAGE) if n > 0 else MAX_PER_PAGE,
    }
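    # Worked example for the two computed values above, assuming self.param['n'] == 25:
    # n = 250, num_pages_for_keyword = len(range(0, 250, 100)) = 3 and
    # num_results_per_page = min(250, 100) = 100, i.e. three pages of 100 results each.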
    if 'PROXY_FILE' in self.config and self.config['PROXY_FILE'] not in [None, '']:
        config.update({'proxy_file': self.config['PROXY_FILE'], 'check_proxies': False})
        # NB: check_proxies is a parameter aimed to make the (public) proxy address checked on a website,
        #     so if using a private network proxy, this check is not required
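        #     If a proxy file is given, each line is expected to be of the form
        #     "<protocol> <host>:<port> [<username>:<password>]", for example
        #     (illustrative values only):
        #         http 10.0.0.1:8080 user:secret
        #         socks5 10.0.0.2:1080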
    # scrape on keywords and get a connection to the cache database
    search = scrape_with_config(config)
    # check the status and abort if scraping failed
    for serp in search.serps:
        if serp.status != 'successful' and serp.no_results:
            self.logger.error(serp.status)
            exit(1)
    # collect found links
    links, suggestions = [link for serp in search.serps for link in serp.links], []
    k, l = 0, len(links)
    while len(suggestions) < self.param['n'] and k < l:
        # TODO: write a filter (see the sketch after this function), e.g. for:
        #  - favouring links whose domain contains one or more of the keywords
        #  - excluding links on specific forums and/or download sites
        #  - excluding malicious domains according to Norton Safe Web or other security sources (e.g. Webputation)
        suggestions.append({'link': links[k].link, 'title': links[k].title, 'text': links[k].snippet})
        k += 1
    return suggestions
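# A link filter like the one described in the TODO above is not implemented in this
# snippet. A minimal sketch could look like the following; the function name and the
# blocklist tokens are purely illustrative, and the matching is intentionally naive:
def filter_links(links, keywords, blocklist=('forum', 'download', 'torrent')):
    # drop blocklisted links, then put links whose URL mentions a keyword first
    kept = [l for l in links
            if not any(token in (l.link or '').lower() for token in blocklist)]
    return sorted(kept,
                  key=lambda l: not any(k.lower() in (l.link or '').lower()
                                        for k in keywords))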
Example #52
def run_crawler(searchString, jsonFileName):
    # See in the config.cfg file for possible values
    config = {
        'keyword': searchString,
        'search_engines':['google', 'bing'],
        'num_pages_for_keyword': 10,
        'output_filename': jsonFileName,
        'SCRAPING': {
            'use_own_ip': 'True',
            'num_pages_for_keyword': 1
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
        },
        'GLOBAL': {
            'do_caching': 'False'
        }
    }

    try:
        sqlalchemy_session = scrape_with_config(config)
    except GoogleSearchError as e:
        print("GoogleSearchError")
        print(e)
Example #53
    def test_asynchronous_mode_bing_and_yandex(self):
        """
        Expected results:
        - around 60 results
        - 30 results for bing and 30 results for yandex
        - valid json file with the contents
        """
        results_file = os.path.join(tempfile.gettempdir(), 'async_results.json')
        if os.path.exists(results_file):
            os.remove(results_file)

        config = {
            'keyword': 'where is my mind',
            'search_engines': ['bing', 'yandex'],
            'num_results_per_page': 10,
            'num_pages_for_keyword': 3,
            'scrape_method': 'http-async',
            'output_filename': results_file,
            'do_caching': False,
        }

        search = scrape_with_config(config)

        self.assertEqual(search.keyword_file, '')
        self.assertLess(search.started_searching, search.stopped_searching)
        self.assertEqual(search.number_proxies_used, 1)
        self.assertEqual(search.number_search_engines_used, 2)
        self.assertEqual(search.number_search_queries, 1)
        self.assertEqual(len(search.serps), 6)

        # test that the set of scraped page numbers is {1, 2, 3}
        self.assertSetEqual(set([serp.page_number for serp in search.serps]), {1,2,3})

        self.assertAlmostEqual(sum([len(serp.links) for serp in search.serps]), 60, delta=10)
        self.assertAlmostEqual(sum([len(serp.links) for serp in search.serps if serp.search_engine_name == 'yandex']), 30, delta=5)
        self.assertAlmostEqual(sum([len(serp.links) for serp in search.serps if serp.search_engine_name == 'bing']), 30, delta=5)

        for serp in search.serps:
            self.assertEqual(serp.query, 'where is my mind')
            self.assertEqual(serp.status, 'successful')
            self.assertIn(serp.search_engine_name.lower(), ('bing', 'yandex'))
            self.assertEqual(serp.scrape_method, 'http-async')
            if serp.search_engine_name != 'yandex':
                self.assertTrue(is_string_and_longer_than(serp.num_results_for_query, 5))
            self.assertAlmostEqual(serp.num_results, 10, delta=2)
            self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query)
            self.assertEqual(serp.num_results, len(serp.links))

            predicate_true_at_least_n_times(lambda v: is_string_and_longer_than(v, 3),
                                                    serp.links, 7, 'snippet')
            for link in serp.links:
                if link.link_type == 'results':
                    self.assertTrue(is_string_and_longer_than(link.title, 3))

                self.assertTrue(is_string_and_longer_than(link.link, 10))
                self.assertTrue(isinstance(link.rank, int))

        # test that the json output is correct
        self.assertTrue(os.path.isfile(results_file))

        with open(results_file, 'rt') as file:
            obj = json.load(file)

            # check the same stuff again for the json file
            for i, page in enumerate(obj):
                self.assertEqual(page['effective_query'], '')
                self.assertEqual(page['num_results'], str(len(page['results'])))
                if page['search_engine_name'].lower() != 'yandex':
                    self.assertTrue(is_string_and_longer_than(page['num_results_for_query'], 5))
                self.assertEqual(page['query'], 'where is my mind')
                self.assertEqual(page['requested_by'], 'localhost')

                for j, result in enumerate(page['results']):
                    if result['link_type'] == 'results':
                        self.assertTrue(is_string_and_longer_than(result['title'], 3))
                        self.assertTrue(is_string_and_longer_than(result['snippet'], 3))

                    self.assertTrue(is_string_and_longer_than(result['link'], 10))
                    self.assertTrue(isinstance(int(result['rank']), int))
Example #54
    def test_google_with_chrome_and_json_output(self):
            """
            Very common use case:

            Ensures that we can scrape three consecutive result pages with Google
            using Chrome in normal mode and save the results to a JSON file.
            """
            results_file = os.path.join(tempfile.gettempdir(), 'results-chrome.json')
            if os.path.exists(results_file):
                os.remove(results_file)


            query = 'Food New York'

            config = {
                'keyword': query,
                'search_engines': ['Google'],
                'num_results_per_page': 100,
                'num_pages_for_keyword': 3,
                'scrape_method': 'selenium',
                'sel_browser': 'chrome',
                'do_sleep': False,
                'browser_mode': 'normal',
                'chromedriver_path': '/home/nikolai/projects/private/Drivers/chromedriver',
                'output_filename': results_file,
                'do_caching': False,
            }

            search = scrape_with_config(config)

            self.assertLess(search.started_searching, search.stopped_searching)
            self.assertEqual(search.number_proxies_used, 1)
            self.assertEqual(search.number_search_engines_used, 1)
            self.assertEqual(search.number_search_queries, 1)
            self.assertEqual(len(search.serps), 3)

            self.assertEqual(search.serps[0].page_number, 1)
            self.assertEqual(search.serps[1].page_number, 2)
            self.assertEqual(search.serps[2].page_number, 3)

            for serp in search.serps:
                self.assertEqual(serp.status, 'successful')
                self.assertEqual(serp.search_engine_name.lower(), 'google')
                self.assertEqual(serp.scrape_method, 'selenium')
                self.assertTrue(serp.num_results_for_query)
                self.assertAlmostEqual(int(serp.num_results), 100, delta=10)
                self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query)
                self.assertEqual(serp.no_results, False)
                self.assertEqual(serp.num_results, len(serp.links))

                for j, link in enumerate(serp.links):
                    if link.link_type == 'results':
                        self.assertTrue(is_string_and_longer_than(link.title, 3))
                        self.assertTrue(is_string_and_longer_than(link.snippet, 3))

                    self.assertTrue(is_string_and_longer_than(link.link, 10))
                    self.assertTrue(isinstance(link.rank, int))


            # test that the json output is correct
            self.assertTrue(os.path.isfile(results_file))

            with open(results_file, 'rt') as file:
                obj = json.load(file)

                # check the same stuff again for the json file
                for i, page in enumerate(obj):
                    self.assertEqual(page['effective_query'], '')
                    self.assertEqual(page['no_results'], 'False')
                    self.assertEqual(page['num_results'], str(len(page['results'])))
                    self.assertAlmostEqual(int(page['num_results']), 100, delta=10)
                    self.assertTrue(is_string_and_longer_than(page['num_results_for_query'], 5))
                    self.assertEqual(page['page_number'], str(i+1))
                    self.assertEqual(page['query'], query)
                    # todo: Test requested_at
                    self.assertEqual(page['requested_by'], 'localhost')

                    for j, result in enumerate(page['results']):
                        if result['link_type'] == 'results':
                            self.assertTrue(is_string_and_longer_than(result['title'], 3))
                            self.assertTrue(is_string_and_longer_than(result['snippet'], 3))

                        self.assertTrue(is_string_and_longer_than(result['link'], 10))
                        self.assertTrue(isinstance(int(result['rank']), int))
Example #55
    def test_http_mode_google_csv_output(self):

        results_file = os.path.join(tempfile.gettempdir(), 'results.csv')
        if os.path.exists(results_file):
            os.remove(results_file)

        config = {
            'keyword': 'banana',
            'search_engines': ['Google'],
            'num_results_per_page': 10,
            'num_pages_for_keyword': 2,
            'scrape_method': 'http',
            'output_filename': results_file,
            'do_caching': False,
        }

        search = scrape_with_config(config)

        self.assertLess(search.started_searching, search.stopped_searching)
        self.assertEqual(search.number_proxies_used, 1)
        self.assertEqual(search.number_search_engines_used, 1)
        self.assertEqual(search.number_search_queries, 1)
        self.assertEqual(len(search.serps), 2)

        self.assertEqual(search.serps[0].page_number, 1)
        self.assertEqual(search.serps[1].page_number, 2)

        for serp in search.serps:
            self.assertEqual(serp.query, 'banana')
            self.assertEqual(serp.status, 'successful')
            self.assertEqual(serp.search_engine_name.lower(), 'google')
            self.assertEqual(serp.scrape_method, 'http')
            self.assertTrue(serp.num_results_for_query)
            self.assertAlmostEqual(serp.num_results, 10, delta=2)
            self.assertFalse(is_string_and_longer_than(serp.effective_query, 1), msg=serp.effective_query)
            self.assertEqual(serp.no_results, False)

            self.assertEqual(serp.num_results, len(serp.links))

            predicate_true_at_least_n_times(lambda v: is_string_and_longer_than(v, 3),
                                                    serp.links, 7, 'snippet')
            for link in serp.links:
                if link.link_type == 'results':
                    self.assertTrue(is_string_and_longer_than(link.title, 3))

                self.assertTrue(is_string_and_longer_than(link.link, 10))
                self.assertTrue(isinstance(link.rank, int))

        # test that the csv output is correct
        self.assertTrue(os.path.isfile(results_file))

        with open(results_file, 'rt') as file:
            reader = csv.DictReader(file, delimiter=',')

            rows = [row for row in reader]

            self.assertAlmostEqual(20, len(rows), delta=3)

            for row in rows:
                self.assertEqual(row['query'], 'banana')
                self.assertTrue(is_string_and_longer_than(row['requested_at'], 5))
                self.assertTrue(int(row['num_results']))
                self.assertEqual(row['scrape_method'], 'http')
                self.assertEqual(row['requested_by'], 'localhost')
                self.assertEqual(row['search_engine_name'], 'google')
                self.assertIn(int(row['page_number']), [1,2])
                self.assertEqual(row['status'], 'successful')
                self.assertTrue(row['no_results'] == 'False')
                self.assertTrue(row['effective_query'] == '')

                if row['link_type'] == 'results':
                    self.assertTrue(is_string_and_longer_than(row['title'], 3))
                    self.assertTrue(is_string_and_longer_than(row['snippet'], 3))
                    self.assertTrue(is_string_and_longer_than(row['domain'], 5))
                    self.assertTrue(is_string_and_longer_than(row['visible_link'], 5))
                    self.assertTrue(is_string_and_longer_than(row['num_results_for_query'], 3))

                self.assertTrue(is_string_and_longer_than(row['link'], 10))
                self.assertTrue(row['rank'].isdigit())

            # ensure that at least 80% of all entries have a string as snippet
            predicate_true_at_least_n_times(lambda v: is_string_and_longer_than(v, 3), rows, int(0.8*len(rows)), 'snippet')
Example #56
def image_search(query, engines, pages):
    target_directory = 'images/'

    # See in the config.cfg file for possible values
    config = {
        'use_own_ip': True,
        'keyword': query,
        'search_engines': [engines],
        'search_type': 'image',
        'num_pages_for_keyword': pages,
        'scrape_method': 'http',
        'do_caching': False
    }

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
        return

    image_urls = []
    search = search.query(ScraperSearch).all()[-1]

    for serp in search.serps:
        image_urls.extend(
            [link.link for link in serp.links]
        )

    print('[i] Going to scrape {num} images and saving them in "{dir}"'.format(
        num=len(image_urls),
        dir=target_directory
    ))

    import threading, requests, os, urllib.parse

    class FetchResource(threading.Thread):
        """Grabs a web resource and stores it in the target directory"""
        def __init__(self, target, urls):
            super().__init__()
            self.target = target
            self.urls = urls

        def run(self):
            for url in self.urls:
                url = urllib.parse.unquote(url)
                try:
                    content = requests.get(url).content
                    with open(os.path.join(self.target, url.split('/')[-1]), 'wb') as f:
                        f.write(content)
                    print('[+] Fetched {}'.format(url))
                except Exception:
                    # skip resources that could not be downloaded or saved
                    pass

    # make a directory for the results
    try:
        os.mkdir(target_directory)
    except FileExistsError:
        pass

    # fire up 100 threads to get the images
    num_threads = 100

    threads = [FetchResource('images/', []) for i in range(num_threads)]

    while image_urls:
        for t in threads:
            try:
                t.urls.append(image_urls.pop())
            except IndexError as e:
                break

    threads = [t for t in threads if t.urls]

    for t in threads:
        t.start()

    for t in threads:
        t.join()
Example #57
"""

from GoogleScraper import scrape_with_config, GoogleSearchError

if __name__ == '__main__':
    # See in the config.cfg file for possible values
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keyword': 'Hello World'
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
            'manual_captcha_solving': 'True'
        },
        'GLOBAL': {
            'do_caching': 'True'
        }
    }

    try:
        # scrape() and scrape_with_config() will return a handle to a sqlite database with the results
        db = scrape_with_config(config)

        print(db.execute('SELECT * FROM link').fetchall())

    except GoogleSearchError as e:
        print(e)


                ['"{}"'.format(s) for s in sentence.split(',') if len(s) > 25]
            )

    return chunks

# write the chunks to a file ('text' is assumed to hold the document being checked,
# loaded in the truncated part of this example)
with open('chunks.txt', 'wt') as f:
    for chunk in make_chunks(text):
        f.write(chunk + '\n')

# See in the config.cfg file for possible values
config = {
    'use_own_ip': True,
    'keyword_file': 'chunks.txt',
    'search_engines': ['google'],
    'num_pages_for_keyword': 1,
    'scrape_method': 'selenium',
    'sel_browser': 'chrome',
}

try:
    search = scrape_with_config(config)
except GoogleSearchError as e:
    print(e)

for serp in search.serps:

    # if the search engine did not rewrite the query (empty effective_query), the
    # exact phrase was found verbatim, which hints at plagiarized content.
    if not serp.effective_query:
        print('Found plagiarized content: "{}"'.format(serp.query))