Example #1
def save_artist_image(aname, size):
    """Find and save an image for the artist with name `aname`."""
    # todo: save both images with one call
    config = serpscrap.Config()

    config.set('search_engines', ['googleimg'])
    config.set('pages_per_keyword', 1)
    config.set('screenshot', False)
    config.set('search_type', 'image')
    config.set('sleeping_max', 10)
    config.set('image_type', 'any')  # required -- has no effect
    config.set('image_size', 'l')  # required -- has no effect

    keywords = [aname]
    scrap = serpscrap.SerpScrap()
    scrap.init(keywords=keywords, config=config.get())
    results = scrap.run()

    # limit to first NUM_RES results
    if len(results) > NUM_RES:
        results = results[:NUM_RES]

    for result in results:
        url = urllib.parse.unquote(result['serp_url'])
        try:
            content = requests.get(url).content
            saved = save(content, size)
            if saved is not None:
                return saved
        except Exception:
            # ignore download or save failures and try the next result
            continue
    return None
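The snippet above relies on a `save()` helper and a `NUM_RES` constant that live elsewhere in the original module. A minimal sketch of what they might look like, assuming Pillow is used to resize and store the downloaded bytes (the cap, target directory, and file naming are hypothetical):

import io
import os
import uuid

from PIL import Image  # assumption: Pillow is available for resizing

NUM_RES = 5  # hypothetical cap on how many search results to try

def save(content, size):
    """Resize the downloaded image bytes and write them to disk; return the path, or None on failure."""
    try:
        image = Image.open(io.BytesIO(content))
        image.thumbnail((size, size))
        os.makedirs('/tmp/artists', exist_ok=True)  # hypothetical target directory
        path = '/tmp/artists/{}.jpg'.format(uuid.uuid4().hex)
        image.convert('RGB').save(path, 'JPEG')
        return path
    except Exception:
        return None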
Example #2
def get_related(config, keywords, related):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    scrap.run()
    results = scrap.get_related()
    for keyword in results:
        if keyword['keyword'] not in related:
            related.append(keyword['keyword'])
    return related
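One possible way to drive `get_related` iteratively and widen a seed keyword list; the number of rounds is made up, and each round re-queries everything found so far, which multiplies the number of requests:

def expand_keywords(config, seeds, rounds=2):
    """Call get_related repeatedly to widen a seed keyword list."""
    related = []
    keywords = list(seeds)
    for _ in range(rounds):
        related = get_related(config, keywords, related)
        # the next round also queries the keywords discovered so far
        keywords = list(related)
    return related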
Example #3
def google_scraper():
    keywords = ['chat.whatsapp.com/', 'chat.whatsapp.com/*', 'inurl:chat.whatsapp.com/', 'link:chat.whatsapp.com']
    config = serpscrap.Config()
    config.set('scrape_urls', True)
    config.set('num_pages_for_keyword', 100)  # 100 pages per keyword
    config.set('num_results_per_page', 20)  # 20 results per page
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    scrap.as_csv('raw_rez')
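`as_csv()` writes the scraped SERPs to disk rather than returning them. A sketch of how the output could be filtered for WhatsApp invite links afterwards, assuming the file ends up as raw_rez.csv and that the CSV reuses the serp_url column seen in the other snippets on this page:

import csv

def extract_invite_links(path='raw_rez.csv'):  # assumption: as_csv() wrote this file
    """Collect unique chat.whatsapp.com invite URLs from the scraped CSV."""
    links = set()
    with open(path, newline='', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            url = row.get('serp_url') or ''
            if 'chat.whatsapp.com/' in url:
                links.add(url)
    return sorted(links)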
Example #4
    def scrape_google_snippets(self, query='', file='info.txt'):
        keyword = [query]

        config = serpscrap.Config()
        config.set('scrape_urls', False)

        scrap = serpscrap.SerpScrap()
        scrap.init(config=config.get(), keywords=keyword)
        results = scrap.run()

        # the original opened `file` but never used it; appending the snippets
        # here is the assumed intent, and the handle is now closed properly
        with open(file, 'a') as f:
            for result in results:
                f.write('{}\n'.format(result.get('serp_snippet') or ''))

        return results
Example #5
    def scrap(self):
        config = serpscrap.Config()
        config_new = {
            'cachedir': '/tmp/.serpscrap/',
            'clean_cache_after': 100,
            'database_name': '/tmp/serpscrap',
            'do_caching': True,
            'num_pages_for_keyword': 1,
            'scrape_urls': True,
            'search_engines': ['google'],
            'google_search_url': 'https://www.google.com/?gl=us&hl=en&pws=0&gws_rd=cr',
            'executable_path': '/tools/chromedriver',
        }
        config.apply(config_new)
        scrap = serpscrap.SerpScrap()
        scrap.init(config=config.get(), keywords=self.keywords)
        self.scrap = scrap.run()
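The method above assumes an enclosing class that provides `self.keywords`; note also that assigning the results to `self.scrap` shadows the `scrap` method on that instance after the first call. A hedged sketch of a wrapper that avoids the shadowing, assuming serpscrap is imported as in Example #8 (class and attribute names are hypothetical):

class SerpJob:
    """Hypothetical holder for a keyword list plus the scrape results."""

    def __init__(self, keywords):
        self.keywords = keywords
        self.results = None  # filled in by run(); a separate name avoids shadowing

    def run(self):
        config = serpscrap.Config()
        config.apply({
            'num_pages_for_keyword': 1,
            'scrape_urls': True,
            'search_engines': ['google'],
        })
        scrap = serpscrap.SerpScrap()
        scrap.init(config=config.get(), keywords=self.keywords)
        self.results = scrap.run()
        return self.results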
Example #6
def get_scrapes(keyword):
    query = clean_bag_of_words_stop_words(keyword)
    query = ' '.join([item for sublist in query for item in sublist])
    keywords = query
    config = serpscrap.Config()
    config.set('scrape_urls', False)

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    results = scrap.run()

    def strip(obj):
        # guard against missing fields so the join below never sees None
        return obj if obj is not None else ' '

    return ' '.join([strip(result['serp_snippet']) + strip(result['serp_title']) for result in results])
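`clean_bag_of_words_stop_words` is defined elsewhere in the original project; a rough stand-in, assuming it tokenizes the keyword and drops common English stop words, returning the list-of-token-lists shape that `get_scrapes` flattens (the stop-word list is abbreviated and hypothetical):

import re

STOP_WORDS = {'a', 'an', 'and', 'in', 'is', 'of', 'the', 'to'}  # abbreviated, hypothetical list

def clean_bag_of_words_stop_words(text):
    """Return a list containing one token list, with stop words removed."""
    tokens = re.findall(r'[a-z0-9]+', text.lower())
    return [[token for token in tokens if token not in STOP_WORDS]]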
Example #7
def get_google_links_snippets(query):
    """ retrieves top 10 results (for which snippets could be retrieved) along with URLs and snippets """
    config = serpscrap.Config()
    config.set('scrape_urls', False)
    config.set('do_caching', False)

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=[query])
    results = scrap.scrap_serps()
    i = 0

    urls = []
    google_snippets = []

    for result in results:
        if result['serp_snippet'] and i < 10:
            urls.append(result['serp_url'])
            google_snippets.append(
                re.sub(r'[^\x00-\x7F]+', ' ',
                       result['serp_snippet']).replace('\n', ''))
            i += 1

    return urls, google_snippets
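A possible call site (the query string is made up):

urls, snippets = get_google_links_snippets('open source web scraping')
for url, snippet in zip(urls, snippets):
    print(url)
    print('  ' + snippet)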
Example #8
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pprint
import serpscrap

keywords = ['example']

config = serpscrap.Config()
config.set('scrape_urls', False)

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()

for result in results:
    pprint.pprint(result)
    print()
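Judging from the other snippets on this page, each result dict carries at least serp_title, serp_url, and serp_snippet. A small continuation of the script above that keeps only those fields (the field selection is an assumption, not the full schema):

slim = [
    {key: result.get(key) for key in ('serp_title', 'serp_url', 'serp_snippet')}
    for result in results
]
pprint.pprint(slim)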
Example #9
def scrape(config, keywords):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    return scrap.run()
Example #10
def scrape_to_csv(config, keywords):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    return scrap.as_csv('/tmp/planet-earth')
Example #11
def scrape_to_csv(config, keywords):
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    return scrap.as_csv('/tmp/cryptocurrency')
Example #12
def main(args):
    """main driver"""
    test = False
    dbname = './tmp/{}_{}_{}_{}'.format(
        NOW,
        args.comparison, args.num_locations, args.query_source)
    if args.query_source == 'trends':
        keyword_objs = from_trends_top_query_by_category()
    elif args.query_source == 'csv':
        keyword_objs = from_csv()
    elif args.query_source == 'test':
        test = True
        keyword_objs = [{
            'keyword': 'pizza',
            'category': args.query_source,
        }, {
            'keyword': 'coffee',
            'category': args.query_source,
        }, {
            'keyword': 'trump',
            'category': args.query_source,
        }, {
            'keyword': 'football',
            'category': args.query_source,
        },]
    elif args.query_source in ['all', 'all6', 'extra']:
        keyword_objs = []
        if args.query_source in ['all', 'all6']:
            for query_source in ['procon_popular', 'trending', ]:
                keywords = CURATED[query_source]
                keyword_objs += [
                    {
                        'keyword': keyword,
                        'category': query_source,
                    } for keyword in keywords
                ]
            keyword_objs += CURATED['popular']
        if args.query_source in ['all6', 'extra']:
            for query_source in ['top_insurance', 'top_loans', 'med_sample_first_20', ]:
                keywords = CURATED[query_source] 
                keyword_objs += [
                    {
                        'keyword': keyword,
                        'category': query_source,
                    } for keyword in keywords
                ]
    elif args.query_source == 'expanded':
        keyword_objs = []
        keywords = CURATED['procon_a_to_z']
        keyword_objs += [
            {
                'keyword': keyword,
                'category': args.query_source,
            } for keyword in keywords
        ]
        keyword_objs += from_trends_top_query_by_category(15)
    else:
        keywords = CURATED[args.query_source]
        keyword_objs = [
            {
                'keyword': keyword,
                'category': args.query_source,
            } for keyword in keywords
        ]
    
    print(keyword_objs)

    config = serpscrap.Config()
    config.set('do_caching', False)

    if VERSION == 'chrome':
        config.set('sel_browser', 'chrome')
        config.set('executable_path', CHROME_PATH)
        config.set('chromedriver_log', CHROMEDRIVER_LOG)
    else:
        config.set('executable_path',
                   PHANT_PATH)

    # config.set('use_own_ip', False)
    # config.set('proxy_file', 'proxy.txt')
    config.set('num_pages_for_keyword', 1)
    config.set('num_results_per_page', 30)  # overshoots actual number of results per page
    config.set('screenshot', False)
    # config.set('mobile_emulation', True)
    print(dbname)
    config.set('database_name', dbname)
    config.set('save_html', True)
    config.set('use_control', False)
    location_df = load_locations()
    locations = []

    if args.comparison == 'test':
        locations.append({
            'engine': 'google',
            'latitude': 34.063,
            'longitude': -118.44,
            'urban_rural_code': 1,
            'median_income': 0,
            'percent_dem': 0,
            'population_estimate': 0,
            'name': 'almaden',
        })
    else:
        if args.comparison == 'urban-rural':
            subsets = [
                location_df[location_df[URBAN_RURAL_COL] == 1],
                location_df[location_df[URBAN_RURAL_COL] == 6],
            ]
        elif args.comparison == 'income' or args.comparison == 'voting':
            if args.comparison == 'income':
                sort_col = MEDIAN_INCOME_COL
            else:
                sort_col = VOTING_COL
            print('Going to sort by {}'.format(sort_col))
            location_df = location_df.sort_values(by=[sort_col])
            print(location_df)
            lower_set = location_df.head(args.num_locations)
            upper_set = location_df.tail(args.num_locations)
            subsets = [lower_set, upper_set]
        else:
            subsets = [location_df]
        for subset in subsets:
            if args.comparison == 'population_weighted':
                sample = subset.sample(
                    n=args.num_locations, weights=subset.POP_ESTIMATE_2016)
            else:
                # only fall back to an unweighted sample when no weighting was requested
                sample = subset.sample(n=args.num_locations)
            for _, row in sample.iterrows():
                locations.append({
                    'engine': 'google',
                    'latitude': row.INTPTLAT,
                    'longitude': row.INTPTLONG,
                    'urban_rural_code': row[URBAN_RURAL_COL],
                    'median_income': row[MEDIAN_INCOME_COL],
                    'percent_dem': row[VOTING_COL],
                    'population_estimate': row[POPULATION_COL],
                    'name': row.NAME
                })
    pprint(locations)
    config.set('search_instances', locations)
    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keyword_objs)
    a, b = len(keyword_objs), len(locations)
    estimated_time = round(a * b / 60, 2)
    if not test:
        yag = yagmail.SMTP('*****@*****.**', os.environ['MAILBOT_PASSWORD'])
        start_contents = """
            About to run! In total, {} keywords will be searched across {} locations.
            At a rate of ~1 SERP/min, this will take approximately {} hours.
            Keep in mind that going over 28 hours may result in a longer term IP ban.
            Arguments are {}.
            """.format(
                a, b, estimated_time, args
            )
        yag.send('*****@*****.**', 'Scrape starting', start_contents)

    try:
        scrap.run()
    except ValueError as err:
        new_dbname = 'take2' + dbname
        err_contents = ['Error: {}. Going to wait one hour and try again! Results will be in {}'.format(
            err, new_dbname)]
        if not test:
            yag = yagmail.SMTP('*****@*****.**', os.environ['MAILBOT_PASSWORD'])
            yag.send('*****@*****.**', 'Scrape error', err_contents)
        time.sleep(3600)
        config.set('database_name', new_dbname)
        scrap2 = serpscrap.SerpScrap()
        scrap2.init(config=config.get(), keywords=keyword_objs)
        scrap2.run()

    if not test:
        end_contents = ['you-geo-see main.py finished running! Arguments were: {}'.format(args)]
        yag = yagmail.SMTP('*****@*****.**', os.environ['MAILBOT_PASSWORD'])
        yag.send('*****@*****.**', 'Scrape success', end_contents)
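The driver expects an args namespace with comparison, num_locations, and query_source, plus module-level constants such as NOW, VERSION, CURATED, and the column names. A hedged sketch of the argparse wiring that could produce args (flag names mirror the attributes used above; defaults and example values are made up):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run location-split SERP scrapes.')
    parser.add_argument('--comparison', default='test')          # e.g. test, urban-rural, income, voting, population_weighted
    parser.add_argument('--num_locations', type=int, default=2)
    parser.add_argument('--query_source', default='test')        # e.g. test, trends, csv, all, all6, extra, expanded
    main(parser.parse_args())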