示例#1
0
def scrape(cores, db_name):
    """
    INPUT:
        cores (int) - Number of cores to use for MP.
        db_name - Database name; passed as database_name to YelpDatabase
            and as the first argument to mongo_connect.

    OUTPUT:
        None.

    Wrapper function to begin scraping list of URLs gathered from Yelp API
    which have not yet been scraped. Calls scrape_func.

    Only rows with review_count >= 20 are considered. Their URLs are sorted
    by review_count (descending) and de-duplicated, and any URL already
    returned by distinct('url') on mongo_connect(db_name, "rest_scrape") is
    skipped. If ip_test(PROXY_IP) fails, nothing is scraped and a reminder
    is printed instead.
    """
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    if not ip_test(PROXY_IP):
        print("Connect to proxy!")
        return

    nyc_rest = YelpDatabase(database_name=db_name,
                            cat_filt="restaurants")
    df_yelp = nyc_rest.get_full_df()
    df_yelp_lim = df_yelp[df_yelp.review_count >= 20]

    url_counts = df_yelp_lim[['review_count', 'url']]
    # DataFrame.sort was removed in pandas 0.20 in favour of sort_values
    # (available since 0.17). Prefer the new name and fall back to the old
    # one so the function works on either side of that change.
    sort_frame = getattr(url_counts, 'sort_values', None)
    if sort_frame is None:
        sort_frame = url_counts.sort
    all_sorted_urls = sort_frame('review_count', ascending=False)\
        .drop_duplicates('url').url.values

    yelp_scraped = set(mongo_connect(db_name, "rest_scrape")
                       .distinct('url'))

    urls = [x for x in all_sorted_urls if x not in yelp_scraped]

    print("Remaining: {0}".format(len(urls)))
    if not urls:
        # Nothing left to do; avoid spawning worker processes for no work.
        return

    pool = Pool(cores)
    try:
        pool.map(scrape_func, urls)
    finally:
        # map() only returns once every task has finished, so terminating
        # here loses no work on success, and on failure it stops the
        # workers instead of leaving them running.
        pool.terminate()
        pool.join()
示例#2
0
 def __init__(self, database_name, cat_filt):
     """
     INPUT:
         database_name - Forwarded unchanged to YelpDatabase.__init__.
         cat_filt - Forwarded unchanged to YelpDatabase.__init__.

     OUTPUT:
         None.

     Initialise the instance by delegating directly to
     YelpDatabase.__init__; no additional state is set here.
     NOTE(review): the enclosing class is not visible in this excerpt -
     presumably a YelpDatabase subclass; confirm.
     """
     YelpDatabase.__init__(self, database_name, cat_filt)