Example #1
File: gsdl.py  Project: looran/gsdl
    def run(self):
        # Sectioned GoogleScraper configuration (GoogleScraper is imported
        # at module level in gsdl.py).
        config = {
            'SCRAPING': {
                'use_own_ip': 'True',
                'keywords': '\n'.join(self.searches.values()),
                'num_of_pages': "%s" % self.num_pages,
                'scrapemethod': self.scrapemethod
            },
            'SELENIUM': {
                'sel_browser': 'chrome',
                'manual_captcha_solving': 'True',
                # 'sleeping_ranges': '5; 1, 2',  # more aggressive than the defaults
            },
            'GLOBAL': {
                'do_caching': 'True',
                # 'do_caching': 'False',
                # 'cachedir': 'dc
                'db': "results_{asctime}.db",
                # 'debug': 'WARNING',
                'debug': 'ERROR',
            },
            'GOOGLE_SEARCH_PARAMS': {
                'start': "0",
                'num': "30",
            }
        }
        if self.proxyfile:
            print("Using proxies from %s" % self.proxyfile)
            config['GLOBAL']['proxy_file'] = self.proxyfile
        # GoogleScraper.config.update_config(config)  # hack, GoogleScraper config 'db' path is broken when run a 2nd time
        db = GoogleScraper.scrape_with_config(config, usrcb_result=self.cb_results)
        urls = db.execute('SELECT * FROM link').fetchall()  # raw rows from the results database
        db.close()
        self.urls.extend(urls)
        return urls
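
The rows returned here come straight from GoogleScraper's sqlite link table, so a caller could simply dump them. A minimal sketch, assuming `scraper` stands in for an instance of the class that defines run(); the column layout of each row depends on the GoogleScraper version, so the tuples are printed whole:

# Hypothetical consumer of run(); `scraper` is an instance of the class above.
rows = scraper.run()
print("%d result rows scraped" % len(rows))
for row in rows:
    print(row)  # raw tuple from the 'link' table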
Example #2
import GoogleScraper


def scrape(keywords, search_engine: str, num_pages: int) -> object:
    scrape_results = []
    titles = []
    links = []
    snippets = []

    config = {
        'use_own_ip': True,
        'keywords': keywords,
        'search_engines': [search_engine],
        'num_pages_for_keyword': num_pages,
        'scrape_method': 'http',
        'sel_browser': 'chrome',
        'do_caching': False,
        'num_workers': 1
    }
    search = GoogleScraper.scrape_with_config(config)

    # Each SERP holds the parsed links for one results page.
    for serp in search.serps:
        print(serp)
        for link in serp.links:
            titles.append(link.title)
            links.append(link.link)
            snippets.append(str(link.snippet))

    # Return three parallel, index-aligned lists: [titles, links, snippets].
    scrape_results.append(titles)
    scrape_results.append(links)
    scrape_results.append(snippets)
    return scrape_results
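
A minimal sketch of how this helper might be called, assuming GoogleScraper is installed and the function above is importable; the keyword and engine values are placeholders:

# Hypothetical usage of scrape(); the query and engine are made-up examples.
titles, links, snippets = scrape(['python web scraping'], 'google', 1)

for title, url in zip(titles, links):
    print(title, '->', url)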
Example #3
    def search_via_googler(self, query_paraments):
        """
        args['config']['last_update']
            Applications: tbm=app
            Blogs: tbm=blg
            Books: tbm=bks
            Discussions: tbm=dsc
            Images: tbm=isch
            News: tbm=nws
            Patents: tbm=pts
            Places: tbm=plcs
            Recipes: tbm=rcp
            Shopping: tbm=shop
            Video: tbm=vid
        """
        def fix_urls(url):
            # Strip Google AMP path artifacts and make sure the link carries a scheme.
            url = url.replace('/amp/', '')
            url = url.replace('/amp.html', '')
            if 'http://' not in url:
                url = urllib.parse.urljoin('http://', url)
            return url

        google_search_url = 'https://www.google.com/search?tbs=qdr:%s&'
        dateRestrict = query_paraments.get('dateRestrict', 'd')
        config = {
            'use_own_ip': 'True',
            'keywords': [query_paraments['q']],
            'google_search_url': google_search_url % dateRestrict,
            'num_results_per_page': query_paraments.get('results_per_page', 25),
            'num_pages_for_keyword': query_paraments.get('num_pages', 4),
            'num_workers': 2,
            'search_engines': [
                'google',
            ],
            'search_type': 'normal',
            'scrape_method': 'http',
            'do_caching': False,
            'print_results': None,
        }

        logger.debug('Making search with GoogleScraper with the given configuration')

        try:
            google_search = GoogleScraper.scrape_with_config(config)

            urls_without_fix = []
            urls = []
            # Accumulate links from every SERP page, not just the last one.
            for serp in google_search.serps:
                urls_without_fix.extend(r.link for r in serp.links)
                urls.extend(fix_urls(r.link) for r in serp.links)

            logger.debug(
                ('Google Search fixed links successfully extracted with '
                 'query "{}": {:d} links extracted').format(
                     query_paraments['q'], len(urls)))
            logger.debug(
                ('Google Search links without fix successfully extracted '
                 'with query "{}":\n{}').format(query_paraments['q'],
                                                urls_without_fix))
            logger.debug(('List of link extracted from Google Search with the '
                          'query "{}":\n{}').format(query_paraments['q'],
                                                    urls))

            return urls
        except GoogleScraper.GoogleSearchError as e:
            logger.error(str(e))
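
For illustration, the method above might be driven with a parameter dict like the one below. The instance name and query values are assumptions, not part of the snippet; only 'q' is required by the code, and the defaults shown match the .get() fallbacks used above:

# Hypothetical call; `searcher` stands in for an instance of the class
# that defines search_via_googler().
urls = searcher.search_via_googler({
    'q': 'site:example.com python',  # query string (required)
    'dateRestrict': 'w',             # value for tbs=qdr:<x>; defaults to 'd'
    'results_per_page': 25,          # optional, defaults to 25
    'num_pages': 2,                  # optional, defaults to 4
})
if urls:
    for url in urls:
        print(url)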