Example #1
def assign_keywords_to_scrapers(all_keywords):
    """Scrapers are often threads or asynchronous objects.

    Splitting the keywords equally among the workers is crucial
    for maximal performance.

    Args:
        all_keywords: All keywords to scrape

    Returns:
        A list of lists. Each inner list should be assigned to an individual scraper.
    """
    mode = Config['SCRAPING'].get('scrapemethod')

    if mode == 'sel':
        num_scrapers = Config['SELENIUM'].getint('num_browser_instances', 1)
    elif mode == 'http':
        num_scrapers = Config['HTTP'].getint('num_threads', 1)
    else:
        # Unknown scrape method: fall back to a single scraper so the
        # integer division below cannot divide by zero.
        num_scrapers = 1

    if len(all_keywords) > num_scrapers:
        kwgroups = grouper(all_keywords, len(all_keywords)//num_scrapers, fillvalue=None)
    else:
        # Fewer keywords than scrapers: give each keyword its own group.
        kwgroups = [[kw] for kw in all_keywords]

    return kwgroups
Example #2
def assign_keywords_to_scrapers(all_keywords):
    """Scrapers are often threads or asynchronous objects.

    Splitting the keywords equally among the workers is crucial
    for maximal performance.

    Args:
        all_keywords: All keywords to scrape

    Returns:
        A list of lists. Each inner list should be assigned to an individual scraper.
    """
    mode = Config['SCRAPING'].get('scrapemethod')

    if mode == 'sel':
        num_scrapers = Config['SELENIUM'].getint('num_browser_instances', 1)
    elif mode == 'http':
        num_scrapers = Config['HTTP'].getint('num_threads', 1)
    else:
        # Unknown scrape method: fall back to a single scraper so the
        # integer division below cannot divide by zero.
        num_scrapers = 1

    if len(all_keywords) > num_scrapers:
        kwgroups = grouper(all_keywords,
                           len(all_keywords) // num_scrapers,
                           fillvalue=None)
    else:
        # Fewer keywords than scrapers: give each keyword its own group.
        kwgroups = [[kw] for kw in all_keywords]

    return kwgroups
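
Both examples above call a grouper() helper that is not shown in the snippet. It is presumably the classic itertools "grouper" recipe, which collects an iterable into fixed-length chunks and pads the last, shorter chunk with fillvalue; a minimal sketch under that assumption (downstream code then has to skip the None padding):

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks: grouper('ABCDEFG', 3, 'x') -> ABC DEF Gxx."""
    # The classic itertools recipe; GoogleScraper may define it differently.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

# Example: grouper(['a', 'b', 'c', 'd', 'e'], 2) yields ('a', 'b'), ('c', 'd'), ('e', None)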
Example #3
def main():
    """Runs the GoogleScraper application as determined by the various configuration points."""
    global Config
    Config = get_config(True, True)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise ValueError('You must specify a keyword file (one keyword per line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword,] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise ValueError('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise ValueError('No more than 100 results per page are available for Google searches.')

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)
    else:
        proxies = []

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise ValueError('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        conn = maybe_create_db()
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        if Config['GLOBAL'].getboolean('simulate'):
            # TODO: implement simulation
            raise NotImplementedError('Simulating is not implemented yet!')

        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None)
        else:
            # Fewer keywords than browsers: give each keyword its own group.
            kwgroups = [[kw] for kw in remaining]

        # Distribute the proxies evenly on the kws to search
        scrapejobs = []
        Q = queue.Queue()
        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        if not proxies:
            # Without at least one proxy (or the own IP as a None entry),
            # len(proxies) below would be zero.
            raise ValueError("No IPs available for scanning.")

        chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()
        conn.close()
    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep_scrape is not implemented.')

        else:
            results = []
            for kw in keywords:
                r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http')
                results.append(r)
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0), view=Config['HTTP'].get('view', False))
    else:
        raise ValueError('No such scrapemethod. Use "http" or "sel"')
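
The selenium branch above hands scraped results to a ResultsHandler thread through a Queue and, once every scrape job has joined, pushes Config['GLOBAL'].get('all_processed_sig') as a sentinel so the handler knows when to stop. A self-contained sketch of that producer/consumer shutdown pattern (the names here are illustrative, not GoogleScraper's):

import queue
import threading

SENTINEL = 'ALL_PROCESSED'  # stands in for the configured all_processed_sig value

def results_handler(q):
    """Consume scraped items until the sentinel arrives, then return."""
    while True:
        item = q.get()
        if item == SENTINEL:
            break
        print('storing', item)  # the real handler writes the item to sqlite

q = queue.Queue()
handler = threading.Thread(target=results_handler, args=(q,))
handler.start()

for result in ('serp page 1', 'serp page 2'):
    q.put(result)

q.put(SENTINEL)  # all scrape jobs done, signal the handler to stop
handler.join()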
Example #4
def main(return_results=True, force_reload=False, proxies=[]):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically. Will return all scraped results.
    """
    parse_cmd_args()

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException('You must specify a keyword file (one keyword per line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # look for proxies in mysql database or a proxy file if not given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()
    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have')
        logger.info('Scraped for {} keywords (before caching), with {} results a page, in total {} pages for each keyword'.format(
            len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0), Config['SCRAPING'].getint('num_of_pages')))
        logger.info('Used {} distinct proxies in total, with the following ip addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)
        ))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('By using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None)
        else:
            # Fewer keywords than browsers: give each keyword its own group.
            kwgroups = [[kw] for kw in remaining]

        # Distribute the proxies evenly on the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")

        chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http')

                if r:
                    cursor.execute('INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                                 (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result.keys():
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute('INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
            cursor.close()
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))
        return conn
    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
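
Since main(return_results=True) returns the sqlite connection in this variant, scraped links can be read back directly after the call. A hedged usage sketch; the import path is assumed from core.py (see Example #5) and the column names are taken from the INSERT statements above, so the real schema may differ:

from GoogleScraper.core import main  # assumed module path

conn = main(return_results=True)
cursor = conn.cursor()
# Columns follow the INSERT INTO link (...) statement in the example.
cursor.execute('SELECT title, url, rank FROM link ORDER BY serp_id, rank')
for title, url, rank in cursor.fetchall():
    print(rank, title, url)
conn.close()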
Example #5
File: core.py Project: csrgxtu/gps
def main(return_results=True, force_reload=True, proxies=[]):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically. Will return all scraped results.
    """
    global Config
    Config = get_config(True, force_reload)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException(
            'You must specify a keyword file (one keyword per line) with the flag `--keyword-file {filepath}`'
        )

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException(
                'The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([
                line.strip() for line in open(kwfile, 'r').read().split('\n')
            ])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException(
            'No more than 100 results per page are available for Google searches.'
        )

    if not proxies:
        # look for proxies in mysql database or a proxy file if not given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException(
            'Invalid search type! Select one of {}'.format(
                repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()
    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper had been run without the --simulate flag, it would have'
        )
        logger.info(
            'Scraped for {} keywords (before caching), with {} results a page, in total {} pages for each keyword'
            .format(len(keywords),
                    Config['SCRAPING'].getint('num_results_per_page', 0),
                    Config['SCRAPING'].getint('num_of_pages')))
        logger.info(
            'Used {} distinct proxies in total, with the following ip addresses: {}'
            .format(len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(
                Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('By using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(
                keywords,
                conn,
                simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()

        # A lock to prevent multiple threads from solving captcha.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining,
                               len(remaining) // max_sel_browsers,
                               fillvalue=None)
        else:
            # Fewer keywords than browsers: give each keyword its own group.
            kwgroups = [[kw] for kw in remaining]

        # Distribute the proxies evenly on the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException(
                "No proxies available and using own IP is prohibited by configuration. Turning down."
            )

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(
                SelScraper(chunk,
                           rlock,
                           Q,
                           captcha_lock=lock,
                           browser_num=i,
                           proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()
        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError(
                'Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint(
                               'num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')

                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result.keys():
                                for title, snippet, url, pos in result[
                                        result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(),
                                         url.netloc, pos, serp_id))
                results.append(r)
            cursor.close()
        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results,
                                      Config['GLOBAL'].getint('verbosity', 0))
        return conn
    else:
        raise InvalidConfigurationException(
            'No such scrapemethod. Use "http" or "sel"')
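
One caveat that applies to both main() variants: they use a mutable default argument (proxies=[]), and the selenium branch appends None to that list when use_own_ip is set, so repeated programmatic calls share and keep growing the same list. The usual defensive pattern is sketched below; this is a suggestion, not the project's code:

def main(return_results=True, force_reload=True, proxies=None):
    # Create a fresh list per call so proxies.append(None) cannot leak
    # between invocations.
    if proxies is None:
        proxies = []
    ...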