def run(self):
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keywords': '\n'.join(self.searches.values()),
            'num_of_pages': "%s" % self.num_pages,
            'scrapemethod': self.scrapemethod
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
            'manual_captcha_solving': 'True',
            # 'sleeping_ranges': '5; 1, 2',  # more aggressive than the defaults
        },
        'GLOBAL': {
            'do_caching': 'True',
            # 'do_caching': 'False',
            # 'cachedir': 'dc
            'db': "results_{asctime}.db",
            # 'debug': 'WARNING',
            'debug': 'ERROR',
        },
        'GOOGLE_SEARCH_PARAMS': {
            'start': "0",
            'num': "30",
        }
    }

    if self.proxyfile:
        print("Using proxies from %s" % self.proxyfile)
        config['GLOBAL']['proxy_file'] = self.proxyfile

    # GoogleScraper.config.update_config(config)
    # hack: GoogleScraper's 'db' config path is broken the second time around
    db = GoogleScraper.scrape_with_config(config, usrcb_result=self.cb_results)
    # The returned object is queried like a sqlite connection; pull every scraped link.
    urls = db.execute('SELECT * FROM link').fetchall()
    db.close()
    self.urls.extend(urls)
    return urls
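# A minimal usage sketch for run() above, assuming it belongs to a job object that
# carries the attributes it reads. The SimpleNamespace stand-in, the attribute
# values, and the _print_result callback are illustrative assumptions, not part of
# the original code.
from types import SimpleNamespace


def _print_result(search):
    # Result callback handed to GoogleScraper via usrcb_result in run().
    print('got a result batch:', search)


job = SimpleNamespace(
    searches={'example': 'site:example.com python'},  # label -> keyword string
    num_pages=1,
    scrapemethod='sel',
    proxyfile=None,
    urls=[],
    cb_results=_print_result,
)
# urls = run(job)  # would perform a live scrape; needs the (older) GoogleScraper
#                  # version this snippet targets to be installed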
import GoogleScraper


def scrape(keywords, search_engine: str, num_pages: int) -> object:
    scrape_results = []
    titles = []
    links = []
    snippets = []

    config = {
        'use_own_ip': True,
        'keywords': keywords,
        'search_engines': [search_engine],
        'num_pages_for_keyword': num_pages,
        'scrape_method': 'http',
        'sel_browser': 'chrome',
        'do_caching': False,
        'num_workers': 1
    }

    search = GoogleScraper.scrape_with_config(config)

    for serp in search.serps:
        print(serp)
        for link in serp.links:
            titles.append(link.title)
            links.append(link.link)
            snippets.append(str(link.snippet))
            # print(link.title + '\n')
            # print(link.link + '\n')
            # print(link.snippet + '\n')

    scrape_results.append(titles)
    scrape_results.append(links)
    scrape_results.append(snippets)

    return scrape_results
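# Usage sketch for scrape() above (assumption: GoogleScraper is installed and the
# chosen search engine is reachable; the keyword, engine, and page count below are
# illustrative only).
if __name__ == '__main__':
    titles, links, snippets = scrape(['python web scraping'], 'google', 1)
    for title, link, snippet in zip(titles, links, snippets):
        print(title)
        print(link)
        print(snippet)
        print()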
def search_via_googler(self, query_paraments):
    """
    args['config']['last_update']
        Applications: tbm=app
        Blogs: tbm=blg
        Books: tbm=bks
        Discussions: tbm=dsc
        Images: tbm=isch
        News: tbm=nws
        Patents: tbm=pts
        Places: tbm=plcs
        Recipes: tbm=rcp
        Shopping: tbm=shop
        Video: tbm=vid
    """
    def fix_urls(url):
        # Strip AMP artifacts and make sure the link carries an http:// scheme.
        url = url.replace('/amp/', '') if '/amp/' in url else url
        url = url.replace('/amp.html', '') if '/amp.html' in url else url
        url = urllib.parse.urljoin('http://', url) if 'http://' not in url else url
        return url

    google_search_url = 'https://www.google.com/search?tbs=qdr:%s&'
    dateRestrict = query_paraments.get('dateRestrict', 'd')

    config = {
        'use_own_ip': 'True',
        'keywords': [query_paraments['q']],
        'google_search_url': google_search_url % dateRestrict,
        'num_results_per_page': query_paraments.get('results_per_page', 25),
        'num_pages_for_keyword': query_paraments.get('num_pages', 4),
        'num_workers': 2,
        'search_engines': ['google'],
        'search_type': 'normal',
        'scrape_method': 'http',
        'do_caching': False,
        'print_results': None,
    }

    logger.debug('Making a search with the GoogleScraper lib with this configuration')

    try:
        google_search = GoogleScraper.scrape_with_config(config)

        urls_without_fix = []
        urls = []
        for serp in google_search.serps:
            # Accumulate links across every result page instead of keeping only the last SERP.
            urls_without_fix.extend(r.link for r in serp.links)
            urls.extend(fix_urls(r.link) for r in serp.links)

        logger.debug(
            ('Google Search fixed links successfully extracted with '
             'query "{}": {:d} links extracted').format(
                query_paraments['q'], len(urls)))
        logger.debug(
            ('Google Search links without fix successfully extracted '
             'with query "{}":\n{}').format(query_paraments['q'],
                                            urls_without_fix))
        logger.debug(('List of links extracted from Google Search with the '
                      'query "{}":\n{}').format(query_paraments['q'], urls))

        return urls
    except GoogleScraper.GoogleSearchError as e:
        logger.error(str(e))
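# Usage sketch for search_via_googler() above (hypothetical call: the method never
# reads `self`, so any object, even None, works as the first argument; it does rely
# on module-level `GoogleScraper`, `urllib`, and `logger` objects being available.
# The query values below are illustrative; the keys are the ones the method reads
# from query_paraments).
example_query = {
    'q': 'python asyncio tutorial',
    'dateRestrict': 'w',        # becomes tbs=qdr:w, i.e. results from the past week
    'results_per_page': 25,
    'num_pages': 2,
}
# urls = search_via_googler(None, example_query)  # would perform a live scrape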