if self.cache_manager:
                        self.cache_manager.cache_results(scrape.parser, scrape.query, scrape.search_engine_name, scrape.scrape_method,
                                      scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                        if self.scraper_search:
                            self.scraper_search.serps.append(serp)

                        if self.session:
                            self.session.add(serp)
                            self.session.commit()

                        store_serp_result(serp, self.config)
        print("----------------------------end def run(self):-----------------------------")


if __name__ == '__main__':
    from GoogleScraper.config import get_config
    from GoogleScraper.scrape_jobs import default_scrape_jobs_for_keywords

    some_words = get_some_words(n=1)

    cfg = get_config()
    scrape_jobs = list(default_scrape_jobs_for_keywords(some_words, ['bing'], 'http-async', 1))

    manager = AsyncScrapeScheduler(cfg, scrape_jobs)
    manager.run()
Example #2
When testing single functions:

python -m pytest tests/functional_tests.py::GoogleScraperFunctionalTestCase::test_google_with_chrome_and_json_output
"""

import csv
import json
import tempfile
import os
import unittest
from GoogleScraper import scrape_with_config
from GoogleScraper.config import get_config
import testing_utils

base_config = get_config()
all_search_engines = base_config['supported_search_engines']


def is_string_and_longer_than(s, n):
    return isinstance(s, str) and len(s) > n


def predicate_true_at_least_n_times(pred, collection, n, key):
    """
    Asserts that the predicate holds for more than n items in the
    collection, where each item's value is looked up via the given key.
    """
    if hasattr(collection[0], key):
        assert len([getattr(v, key) for v in collection if pred(getattr(v, key))]) > n
    elif key in collection[0]:
        # dict-like items: mirror of the attribute branch above
        assert len([v[key] for v in collection if pred(v[key])]) > n
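
A hypothetical usage of the helper above, with made-up result dicts; the 'title' key and its values are purely illustrative:

results = [{'title': 'a' * 20}, {'title': ''}, {'title': 'b' * 15}]
# passes: two titles are longer than 10 characters, which is more than n=1
predicate_true_at_least_n_times(
    lambda t: is_string_and_longer_than(t, 10), results, 1, 'title')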
Example #3
def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScraper is used from within another program, don't print results to stdout,
                        store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.
        config_from_dict: Configuration that is passed when GoogleScraper is called as library.
    Returns:
        A database session to the results when return_results is True. Else, nothing.
    """
    external_config_file_path = cmd_line_args = None

    if parse_cmd_line:
        cmd_line_args = get_command_line()

        if cmd_line_args.get('config_file', None):
            external_config_file_path = os.path.abspath(
                cmd_line_args.get('config_file'))

    config = get_config(cmd_line_args, external_config_file_path,
                        config_from_dict)

    if isinstance(config['log_level'], int):
        config['log_level'] = logging.getLevelName(config['log_level'])

    setup_logger(level=config.get('log_level').upper())

    if config.get('view_config', False):
        print(open(os.path.join(get_base_path(), 'scrape_config.py')).read())
        return

    if config.get('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if config.get('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(config.get('cachedir')))
        except:
            pass
        return

    init_outfile(config, force_reload=True)

    kwfile = config.get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = config.get('keyword')
    keywords = set(config.get('keywords', []))
    proxy_file = config.get('proxy_file', '')
    proxy_db = config.get('mysql_proxy_db', '')

    # when no search engine is specified, use google
    search_engines = config.get('search_engines', [
        'google',
    ])
    if not isinstance(search_engines, list):
        if search_engines == '*':
            search_engines = config.get('supported_search_engines')
        else:
            search_engines = search_engines.split(',')

    assert isinstance(search_engines,
                      list), 'Search engines must be a list like data type!'
    search_engines = set(search_engines)

    num_search_engines = len(search_engines)
    num_workers = int(config.get('num_workers'))
    scrape_method = config.get('scrape_method')
    pages = int(config.get('num_pages_for_keyword', 1))
    method = config.get('scrape_method', 'http')

    if config.get('shell', False):
        namespace = {}
        session_cls = get_session(config, scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        # Just print the help.
        get_command_line(True)
        print(
            'No keywords to scrape for. Please provide either a keyword file (Option: --keyword-file) or specify a '
            'keyword with --keyword.')
        return

    cache_manager = CacheManager(config)

    if config.get('fix_cache_names'):
        cache_manager.fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [
        keyword,
    ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise WrongConfigurationError(
                'The keyword file {} does not exist.'.format(kwfile))
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    # use splitext: rstrip('.py') would strip any trailing '.', 'p' or 'y' characters
                    modname = os.path.splitext(os.path.basename(kwfile))[0]
                    scrape_jobs = getattr(
                        __import__(modname, fromlist=['scrape_jobs']),
                        'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                keywords = set([
                    line.strip() for line in open(
                        kwfile, 'r', encoding='utf8').read().split('\n')
                    if line.strip()
                ])

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords,
                                                       search_engines,
                                                       scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if config.get('clean_cache_files', False):
        cache_manager.clean_cachefiles()
        return

    if config.get('check_oto', False):
        cache_manager._caching_is_one_to_one(keyword)

    if config.get('num_results_per_page') > 100:
        raise WrongConfigurationError(
            'No more than 100 results per page are available for searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if config.get('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise Exception(
            'No proxies available and using own IP is prohibited by configuration. Shutting down.'
        )

    valid_search_types = ('normal', 'video', 'news', 'image')
    if config.get('search_type') not in valid_search_types:
        raise WrongConfigurationError(
            'Invalid search type! Select one of {}'.format(
                repr(valid_search_types)))

    if config.get('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper had been run without the --simulate flag, it would have:'
        )
        logger.info(
            'Scraped for {} keywords, with {} results per page, in total {} pages for each keyword'
            .format(len(keywords), int(config.get('num_results_per_page', 0)),
                    int(config.get('num_pages_for_keyword'))))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(
            len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([
                    proxy.host + ':' + proxy.port for proxy in proxies if proxy
                ])))

        logger.info('By using {} mode with {} worker instances'.format(
            config.get('scrape_method'), int(config.get('num_workers'))))
        return

    # get a scoped sqlalchemy session
    session_cls = get_session(config, scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(config, session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and config.get('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many requests remain to issue after searching the cache.
    if config.get('do_caching'):
        scrape_jobs = cache_manager.parse_all_cached_files(
            scrape_jobs, session, scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        logger.info(
            'Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines))

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(config,
                                                cache_manager=cache_manager,
                                                mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(config,
                                             scrape_jobs,
                                             cache_manager=cache_manager,
                                             session=session,
                                             scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise Exception('No such scrape_method {}'.format(
                config.get('scrape_method')))

    from GoogleScraper.output_converter import close_outfile
    close_outfile()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search
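
Example #3 shows that main() can be driven from another program through return_results, parse_cmd_line and config_from_dict instead of the command line. The following is a minimal sketch of such a library call; the flat configuration keys mirror the config.get() lookups inside main() above, but the exact shape get_config() expects for config_from_dict is an assumption:

from GoogleScraper.core import main

search = main(
    return_results=True,
    parse_cmd_line=False,
    config_from_dict={
        'keyword': 'example query',      # read via config.get('keyword') above
        'search_engines': ['google'],
        'scrape_method': 'http',
        'num_pages_for_keyword': 1,
        'use_own_ip': True,              # otherwise main() raises because no proxies are configured
    },
)
if search is not None:
    # ScraperSearch.serps is filled during the scrape (see the append in Example #1)
    for serp in search.serps:
        print(serp)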
Example #4
def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScraper is used from within another program, don't print results to stdout,
                        store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.
        config_from_dict: Configuration that is passed when GoogleScraper is called as library.
    Returns:
        A database session to the results when return_results is True. Else, nothing.
    """
    external_config_file_path = cmd_line_args = None

    if parse_cmd_line:
        cmd_line_args = get_command_line()

        if cmd_line_args.get('config_file', None):
            external_config_file_path = os.path.abspath(cmd_line_args.get('config_file'))

    config = get_config(cmd_line_args, external_config_file_path, config_from_dict)

    if isinstance(config['log_level'], int):
        config['log_level'] = logging.getLevelName(config['log_level'])

    setup_logger(level=config.get('log_level').upper())

    if config.get('view_config', False):
        print(open(os.path.join(get_base_path(), 'scrape_config.py')).read())
        return

    if config.get('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if config.get('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(config.get('cachedir')))
        except:
            pass
        return

    init_outfile(config, force_reload=True)

    kwfile = config.get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = config.get('keyword')
    keywords = set(config.get('keywords', []))
    proxy_file = config.get('proxy_file', '')
    proxy_db = config.get('mysql_proxy_db', '')
    proxy_list = config.get('proxy_list', [])

    # when no search engine is specified, use google
    search_engines = config.get('search_engines', ['google',])
    if not isinstance(search_engines, list):
        if search_engines == '*':
            search_engines = config.get('supported_search_engines')
        else:
            search_engines = search_engines.split(',')

    assert isinstance(search_engines, list), 'Search engines must be a list like data type!'
    search_engines = set(search_engines)

    num_search_engines = len(search_engines)
    num_workers = int(config.get('num_workers'))
    scrape_method = config.get('scrape_method')
    pages = int(config.get('num_pages_for_keyword', 1))
    method = config.get('scrape_method', 'http')

    if config.get('shell', False):
        namespace = {}
        session_cls = get_session(config, scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        # Just print the help.
        get_command_line(True)
        print('No keywords to scrape for. Please provide either a keyword file (Option: --keyword-file) or specify a '
            'keyword with --keyword.')
        return

    cache_manager = CacheManager(config)

    if config.get('fix_cache_names'):
        cache_manager.fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [keyword, ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise WrongConfigurationError('The keyword file {} does not exist.'.format(kwfile))
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    # use splitext: rstrip('.py') would strip any trailing '.', 'p' or 'y' characters
                    modname = os.path.splitext(os.path.basename(kwfile))[0]
                    scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n') if line.strip()])

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if config.get('clean_cache_files', False):
        cache_manager.clean_cachefiles()
        return

    if config.get('check_oto', False):
        cache_manager._caching_is_one_to_one(keyword)

    if config.get('num_results_per_page') > 100:
        raise WrongConfigurationError('No more than 100 results per page are available for searches.')

    proxies = []

    if proxy_list:
        proxies = proxy_list
    elif proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if config.get('use_own_ip'):
        proxies.append(None)     

    if not proxies:
        raise Exception('No proxies available and using own IP is prohibited by configuration. Shutting down.')

    valid_search_types = ('normal', 'video', 'news', 'image')
    if config.get('search_type') not in valid_search_types:
        raise WrongConfigurationError('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    if config.get('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords, with {} results per page, in total {} pages for each keyword'.format(
            len(keywords), int(config.get('num_results_per_page', 0)),
            int(config.get('num_pages_for_keyword'))))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([proxy.host + ':' + proxy.port for proxy in proxies if proxy])))

        logger.info('By using {} mode with {} worker instances'.format(config.get('scrape_method'),
                                                                       int(config.get('num_workers'))))
        return

    # get a scoped sqlalchemy session
    session_cls = get_session(config, scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(config, session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and config.get('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines)
        )

    # First of all, lets see how many requests remain to issue after searching the cache.
    if config.get('do_caching'):
        scrape_jobs = cache_manager.parse_all_cached_files(scrape_jobs, session, scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        logger.info('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'.format(
            num_keywords=len(list(scrape_jobs)),
            num_proxies=len(proxies),
            num_threads=num_search_engines))

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(
                                config,
                                cache_manager=cache_manager,
                                mode=method,
                                proxy=proxy,
                                search_engine=search_engine,
                                session=session,
                                db_lock=db_lock,
                                cache_lock=cache_lock,
                                scraper_search=scraper_search,
                                captcha_lock=captcha_lock,
                                progress_queue=q,
                                browser_num=num_worker
                            )
                        )

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(config, scrape_jobs, cache_manager=cache_manager, session=session, scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise Exception('No such scrape_method {}'.format(config.get('scrape_method')))

    from GoogleScraper.output_converter import close_outfile
    close_outfile()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search
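
Example #4 differs from Example #3 mainly in accepting a ready-made proxy list directly from the configuration (the proxy_list key) before falling back to a MySQL proxy database or a proxy file. A hedged sketch of such a configuration follows; the only attributes the code above demonstrably reads from each entry are .host and .port, so the stand-in record below is an illustration rather than the project's real Proxy type:

import collections

# Stand-in proxy record; the real GoogleScraper Proxy type may use different fields.
FakeProxy = collections.namedtuple('FakeProxy', 'proto host port username password')

config_from_dict = {
    'keyword': 'example query',
    'proxy_list': [FakeProxy('socks5', '127.0.0.1', '9050', '', '')],
    'use_own_ip': False,
}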
Example #5
def main():
    """Runs the GoogleScraper application as determined by the various configuration points."""
    global Config
    Config = get_config(True, True)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise ValueError('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword,] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise ValueError('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise ValueError('No more than 100 results per page are available for Google searches.')

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)
    else:
        proxies = []

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise ValueError('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        conn = maybe_create_db()
        # First of all, lets see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        if Config['GLOBAL'].getboolean('simulate'):
            # TODO: implement simulation
            raise NotImplementedError('Simulating is not implemented yet!')

        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None)
        else:
            # thats a little special there :)
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly on the kws to search
        scrapejobs = []
        Q = queue.Queue()
        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        if not proxies:
            logger.info("No ip's available for scanning.")

        chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()
        conn.close()
    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep_scrape is not implemented.')

        else:
            results = []
            for kw in keywords:
                r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http')
                results.append(r)
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0), view=Config['HTTP'].get('view', False))
    else:
        raise ValueError('No such scrapemethod. Use "http" or "sel"')
Example #6
    if ie.name == 'socks':
        sys.exit('socks is not installed. Try this one: https://github.com/Anorov/PySocks')

    print(ie)
    sys.exit('You can install missing modules with `pip3 install [modulename]`')

import GoogleScraper.socks as socks
from GoogleScraper.caching import get_cached, cache_results, cached_file_name
from GoogleScraper.config import get_config
from GoogleScraper.parsing import GoogleParser
import GoogleScraper.google_search_params
import webbrowser
import tempfile

logger = logging.getLogger('GoogleScraper')
Config = get_config()

class GoogleSearchError(Exception):
    pass

class InvalidNumberResultsException(GoogleSearchError):
    pass

class MaliciousRequestDetected(GoogleSearchError):
    pass

def timer_support(Class):
    """In python versions prior to 3.3, threading.Timer
    seems to be a function that returns an instance
    of _Timer which is the class we want.
    """
Example #7
def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScraper is used from within another program, don't print results to stdout,
                        store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.
        config_from_dict: Configuration that is passed when GoogleScraper is called as library.
    Returns:
        A database session to the results when return_results is True. Else, nothing.
    """

    external_config_file_path = cmd_line_args = None

    if parse_cmd_line:
        cmd_line_args = get_command_line()

        if cmd_line_args.get('config_file', None):
            external_config_file_path = os.path.abspath(
                cmd_line_args.get('config_file'))

    config = get_config(cmd_line_args, external_config_file_path,
                        config_from_dict)

    keywords = config.get('keywords')
    kwfile = config.get('keyword_file', None)

    if isinstance(config['log_level'], int):
        config['log_level'] = logging.getLevelName(config['log_level'])

    setup_logger(level=config.get('log_level').upper())

    if config.get('view_config', False):
        print(open(os.path.join(get_base_path(), 'scrape_config.py')).read())
        return

    if config.get('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if config.get('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(config.get('cachedir')))
        except:
            pass
        return

    init_outfile(config, force_reload=True)  # in output_converter.py

    proxy_file = config.get('proxy_file', '')
    proxy_db = config.get('mysql_proxy_db', '')

    setup_shell_config(config)
    search_engines = get_search_engines(
        config.get('search_engines', ['google']),
        config.get('supported_search_engines'))

    num_search_engines = len(search_engines)
    num_workers = int(config.get('num_workers'))
    scrape_method = config.get('scrape_method')
    pages = int(config.get('num_pages_for_keyword', 1))
    method = config.get('scrape_method', 'http')

    cache_manager = CacheManager(config)
    if config.get('fix_cache_names'):
        cache_manager.fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    scrape_jobs = {}

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords,
                                                       search_engines,
                                                       scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if config.get('clean_cache_files', False):
        cache_manager.clean_cachefiles()
        return

    if config.get('check_oto', False):
        cache_manager._caching_is_one_to_one(keyword)

    if config.get('num_results_per_page') > 100:
        raise WrongConfigurationError(
            'No more than 100 results per page are available for searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if config.get('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise Exception(
            'No proxies available and using own IP is prohibited by configuration. Shutting down.'
        )

    if config.get('search_type') not in VALID_SEARCH_TYPES:
        raise WrongConfigurationError(
            'Invalid search type! Select one of {}'.format(
                repr(VALID_SEARCH_TYPES)))

    if config.get('simulate', False):
        run_simulation(config.get('num_results_per_page', 0),
                       config.get('num_pages_for_keyword'),
                       config.get('num_workers'))

    # get a scoped sqlalchemy session
    session_cls = get_session(config, scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(config, session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None

    if kwfile and config.get('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many requests remain to issue after searching the cache.
    if config.get('do_caching'):
        scrape_jobs = cache_manager.parse_all_cached_files(
            scrape_jobs, session, scraper_search)

    if scrape_jobs:
        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        logger.info(
            'Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines))

        progress_thread = None

        # Let the games begin

        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(config,
                                                cache_manager=cache_manager,
                                                mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(config,
                                             scrape_jobs,
                                             cache_manager=cache_manager,
                                             session=session,
                                             scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise Exception('No such scrape_method {}'.format(
                config.get('scrape_method')))

    from GoogleScraper.output_converter import close_outfile
    close_outfile()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search
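
Example #7 factors the engine-selection logic that Examples #3 and #4 write inline out into a get_search_engines() helper whose body is not part of this listing. Based on the inline version above, it presumably behaves roughly like this reconstruction (an assumption, not the project's actual code):

def get_search_engines(search_engines, supported_search_engines):
    """Normalize the configured engines to a set, mirroring Examples #3 and #4."""
    if not isinstance(search_engines, list):
        if search_engines == '*':
            search_engines = supported_search_engines
        else:
            search_engines = search_engines.split(',')
    assert isinstance(search_engines, list), 'Search engines must be a list like data type!'
    return set(search_engines)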
Example #8
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
    from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
except ImportError as ie:
    if hasattr(ie, 'name') and ie.name == 'bs4' or hasattr(ie, 'args') and 'bs4' in str(ie):
        sys.exit('Install bs4 with the command "sudo pip3 install beautifulsoup4"')
    if ie.name == 'socks':
        sys.exit('socks is not installed. Try this one: https://github.com/Anorov/PySocks')

    print(ie)
    sys.exit('You can install missing modules with `pip3 install [modulename]`')

logger = logging.getLogger('GoogleScraper')
Config = get_config(False)

def deep_scrape(query):
    """Launches many different Google searches with different parameter combinations to maximize return of results. Depth first.

    This is the heart of core.py. The aim of deep_scrape is to maximize the number of links for a given keyword.
    In order to achieve this goal, we'll heavily combine and rearrange a predefined set of search parameters to force Google to
    yield unique links:
    - different date-ranges for the keyword
    - using synonyms of the keyword
    - search for by different reading levels
    - search by different target languages (and translate the query in this language)
    - diversify results by scraping image search, news search, web search (the normal one) and maybe video search...

    From the google reference:
    When the Google Search Appliance filters results, the top 1000 most relevant URLs are found before the
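
The docstring is cut off here, so the body of deep_scrape() is not shown. A small sketch of the parameter-combination idea it describes, enumerating date ranges, search types and target languages to coax additional unique links out of the engine; the parameter names and values below are illustrative assumptions, not the module's actual search parameters:

import itertools

def example_parameter_combinations():
    """Yield illustrative combinations of search parameters for a single keyword."""
    date_ranges = ['qdr:d', 'qdr:w', 'qdr:m', 'qdr:y']    # past day/week/month/year
    search_types = ['normal', 'image', 'news', 'video']
    languages = ['en', 'de', 'fr']
    for tbs, search_type, hl in itertools.product(date_ranges, search_types, languages):
        yield {'tbs': tbs, 'search_type': search_type, 'hl': hl}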
Example #9
# -*- coding: utf-8 -*-
"""
I switched my motto; instead of saying "f**k tomorrow"
That buck that bought a bottle could've struck the lotto.
"""

__author__ = 'Nikolai Tschacher'
__updated__ = '14.11.2014'  # day.month.year
__home__ = 'incolumitas.com'

from GoogleScraper.proxies import Proxy
from GoogleScraper.config import get_config
from GoogleScraper.log import setup_logger
import logging
"""
All objects imported here are exposed as the public API of GoogleScraper
"""

from GoogleScraper.core import scrape_with_config
from GoogleScraper.scraping import GoogleSearchError

Config = get_config()
setup_logger()
Example #10
# -*- coding: utf-8 -*-

"""
I switched my motto; instead of saying "f**k tomorrow"
That buck that bought a bottle could've struck the lotto.
"""

__author__ = 'Nikolai Tschacher'
__updated__ = '22.01.2015'  # day.month.year
__home__ = 'incolumitas.com'

from GoogleScraper.proxies import Proxy
from GoogleScraper.config import get_config
from GoogleScraper.log import setup_logger

"""
All objects imported here are exposed as the public API of GoogleScraper
"""

from GoogleScraper.core import scrape_with_config
from GoogleScraper.scraping import GoogleSearchError, MaliciousRequestDetected

Config = get_config(parse_command_line=False)
setup_logger()
Example #11
                    if self.cache_manager:
                        self.cache_manager.cache_results(scrape.parser, scrape.query, scrape.search_engine_name, scrape.scrape_method,
                                      scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                        if self.scraper_search:
                            self.scraper_search.serps.append(serp)

                        if self.session:
                            self.session.add(serp)
                            self.session.commit()

                        store_serp_result(serp, self.config)


if __name__ == '__main__':
    from GoogleScraper.config import get_config
    from GoogleScraper.scrape_jobs import default_scrape_jobs_for_keywords

    some_words = get_some_words(n=1)

    cfg = get_config()
    scrape_jobs = list(default_scrape_jobs_for_keywords(some_words, ['bing'], 'http-async', 1))

    manager = AsyncScrapeScheduler(cfg, scrape_jobs)
    manager.run()

Example #12
so it is enforced that only minimally stable versions are online.

When testing single functions:

python -m pytest tests/functional_tests.py::GoogleScraperFunctionalTestCase::test_google_with_phantomjs_and_json_output
"""

import csv
import json
import tempfile
import os
import unittest
from GoogleScraper import scrape_with_config
from GoogleScraper.config import get_config

base_config = get_config()
all_search_engines = base_config['supported_search_engines']


def is_string_and_longer_than(s, n):
    return isinstance(s, str) and len(s) > n


def predicate_true_at_least_n_times(pred, collection, n, key):
    """
    Asserts that the predicate holds for more than n items in the
    collection, where each item's value is looked up via the given key.
    """
    if hasattr(collection[0], key):
        assert len(
            [getattr(v, key) for v in collection if pred(getattr(v, key))]) > n
Example #13
def main(return_results=True, force_reload=True, proxies=[]):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically. Will return all scraped results.
    """
    global Config
    Config = get_config(True, force_reload)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # look for proxies in mysql database or a proxy file if not given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()
    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords (before caching), with {} results per page, in total {} pages for each keyword'.format(
            len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0), Config['SCRAPING'].getint('num_of_pages')))
        logger.info('Used {} distinct proxies in total, with the following ip addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)
        ))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('By using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, lets see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords


        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None)
        else:
            # thats a little special there :)
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly on the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")

        chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http')

                if r:
                    cursor.execute('INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                                 (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result.keys():
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute('INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
            cursor.close()
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))
        return conn
    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
Example #14
File: core.py  Project: csrgxtu/gps
def main(return_results=True, force_reload=True, proxies=[]):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically. Will return all scraped results.
    """
    global Config
    Config = get_config(True, force_reload)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException(
            'You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`'
        )

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [
        keyword,
    ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException(
                'The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates and drop empty lines right at the beginning
            keywords = set(
                line.strip() for line in open(kwfile, 'r').read().split('\n')
                if line.strip())

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException(
            'No more than 100 results per page are available for Google searches.'
        )

    if not proxies:
        # look for proxies in mysql database or a proxy file if not given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException(
            'Invalid search type! Select one of {}'.format(
                repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()
    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper had been run without the --simulate flag, it would have'
        )
        logger.info(
            'scraped {} keywords (before caching), with {} results per page and {} pages for each keyword'
            .format(len(keywords),
                    Config['SCRAPING'].getint('num_results_per_page', 0),
                    Config['SCRAPING'].getint('num_of_pages')))
        logger.info(
            'used {} distinct proxies in total, with the following IP addresses: {}'
            .format(len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(
                Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('By using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, lets see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(
                keywords,
                conn,
                simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining,
                               len(remaining) // max_sel_browsers,
                               fillvalue=None)
        else:
            # special case: fewer keywords than browser instances, so each keyword gets its own group
            kwgroups = [[
                kw,
            ] for kw in remaining]

        # Distribute the proxies evenly on the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            # build a new list so the mutable default argument is never modified in place
            proxies = proxies + [None]
        elif not proxies:
            raise InvalidConfigurationException(
                "No proxies available and using own IP is prohibited by configuration. Turning down."
            )

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(
                SelScraper(chunk,
                           rlock,
                           Q,
                           captcha_lock=lock,
                           browser_num=i,
                           proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError(
                'Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint(
                               'num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')

                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result.keys():
                                for title, snippet, url, pos in result[
                                        result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(),
                                         url.netloc, pos, serp_id))
                results.append(r)
            cursor.close()
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results,
                                      Config['GLOBAL'].getint('verbosity', 0))
        return conn
    else:
        raise InvalidConfigurationException(
            'No such scrapemethod. Use "http" or "sel"')
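
In the selenium branch above, the remaining keywords are first split into kwgroups and each browser instance i then picks its proxy with proxies[i // chunks_per_proxy]. A standalone toy sketch of that distribution, assuming grouper is the usual itertools recipe (the sample keywords and proxy names are made up):

import math
from itertools import zip_longest


def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks (the standard itertools recipe)."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


keywords = ['kw{}'.format(i) for i in range(8)]   # pretend 8 keywords remain after the cache pass
proxies = ['proxy-a', 'proxy-b', None]            # None stands for "use own IP"
max_sel_browsers = 4

kwgroups = list(grouper(keywords, len(keywords) // max_sel_browsers, fillvalue=None))
chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))

for i, chunk in enumerate(kwgroups):
    # Consecutive chunks share one proxy until chunks_per_proxy chunks have been assigned to it,
    # so the proxies are spread evenly over the browser instances.
    print(i, [kw for kw in chunk if kw], proxies[i // chunks_per_proxy])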
Example no. 15
File: core.py Project: csrgxtu/gps
# Tail of the module-level try/except around the third-party imports; the matching try: block precedes this excerpt.
except ImportError as ie:
    if (hasattr(ie, 'name') and ie.name == 'bs4') or (
            hasattr(ie, 'args') and 'bs4' in str(ie)):
        sys.exit(
            'Install bs4 with the command "sudo pip3 install beautifulsoup4"')
    if ie.name == 'socks':
        sys.exit(
            'socks is not installed. Try this one: https://github.com/Anorov/PySocks'
        )

    print(ie)
    sys.exit(
        'You can install missing modules with `pip3 install [modulename]`')

logger = logging.getLogger('GoogleScraper')
Config = get_config(False)


def deep_scrape(query):
    """Launches many different Google searches with different parameter combinations to maximize return of results. Depth first.

    This is the heart of core.py. The aim of deep_scrape is to maximize the number of links for a given keyword.
    In order to achieve this goal, we'll heavily combine and rearrange a predefined set of search parameters to force Google to
    yield unique links:
    - different date-ranges for the keyword
    - using synonyms of the keyword
    - search by different reading levels
    - search by different target languages (and translate the query into that language)
    - diversify results by scraping image search, news search, web search (the normal one) and maybe video search...

    From the google reference: