# Shared imports for this module. NOTE: `cadb` is the project's local DB
# helper module (it provides connect_aws_db); the import path below is an
# assumption -- adjust it to match your layout.
import re
import time

import numpy as np
import pandas as pd
from splinter import Browser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

import connect_aws_db as cadb


def splinter_scrape_ta_reviews(city='', state='', write_to_db=False,
                               start_num=0, end_num=-1):
    """PURPOSE: To scrape the TripAdvisor reviews for every hotel in the
    given city and, optionally, write them to the ta_reviews table."""
    engine = cadb.connect_aws_db(write_unicode=True)

    blinks = get_hotel_urls(city, state, engine)

    # only do the specified hotel range
    if start_num != 0:
        blinks = blinks[start_num:]
    if end_num != -1:
        if len(blinks) < end_num:
            print('end_num exceeded number of hotels. resetting to max.')
            end_num = len(blinks)
        blinks = blinks[:end_num]

    br = Browser()

    donebids = get_done_business_ids(city, engine)

    for hotel_id, biz_id, link in blinks:
        # check to see if there are already reviews for that hotel
        if int(biz_id) not in donebids:
            bigdf = scrape_hotel(link, br, engine)
            bigdf['hotel_id'] = hotel_id
            bigdf['business_id'] = biz_id
            bigdf['biz_review_id'] = np.int64(bigdf['biz_review_id'].values)
            bigdf = remove_duplicates(bigdf, city, engine)
            if write_to_db:
                try:
                    bigdf.to_sql('ta_reviews', engine,
                                 if_exists='append', index=False)
                except Exception:
                    print('WRITING TO DB FAILED!!!')
        else:
            print('business_id {} already scraped.'.format(biz_id))
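
# A minimal usage sketch for the review scraper above (the city/state values
# are illustrative). Scraping a slice of hotels at a time keeps a single
# browser session short-lived and makes it easy to resume after a crash,
# since already-scraped business_ids are skipped on the next run.
def _example_scrape_ta_reviews():
    splinter_scrape_ta_reviews(city='phoenix', state='az',
                               write_to_db=True,
                               start_num=0, end_num=10)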
def retrieve_best_hotels(city, state=''):
    """PURPOSE: To return the names and average ratings of the ten
    best-rated hotels in the given city, based on the Yelp reviews that
    mention dogs."""
    engine = cadb.connect_aws_db(write_unicode=True)

    cmd = 'SELECT * FROM yelp_reviews'
    yelp_reviews = pd.read_sql(cmd, engine)

    cmd = 'SELECT * FROM yelp_hotels'
    yelp_hotels = pd.read_sql(cmd, engine)

    yelp = pd.merge(yelp_hotels, yelp_reviews, on='business_id', how='inner')
    yelp_city = yelp[yelp['hotel_city'] == city.strip()]
    yelp_dog_review = yelp_city[yelp_city['review_text'].str.contains('dog')].copy().reset_index()

    # np.unique returns the hotel_ids in sorted order, so the de-duplicated
    # frame below must be sorted the same way before the ratings are assigned
    average_dog_ratings = [np.mean(yelp_dog_review[yelp_dog_review['hotel_id'] == hotel_id]['review_rating'].values)
                           for hotel_id in np.unique(yelp_dog_review['hotel_id'])]

    unique_hotels = yelp_dog_review.drop_duplicates(subset='hotel_id').sort_values(by='hotel_id').copy()
    unique_hotels['average_rating'] = average_dog_ratings

    best_dog_hotel_names = unique_hotels.sort_values(by='average_rating', ascending=False)['hotel_name'].head(10).values
    best_dog_hotel_ratings = np.round(unique_hotels.sort_values(by='average_rating', ascending=False)['average_rating'].head(10).values, 1)
    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]
    return best_dog_hotel_names, string_ratings
def get_biz_ids(city, engine):
    """Return the business_ids of all ta_hotels in the given city."""
    cmd = "SELECT business_id FROM ta_hotels "
    cmd += "where hotel_city = "
    cmd += '"' + " ".join(city.split("_")) + '"'
    try:
        xstng_bizs = [int(biz_id[0]) for biz_id in pd.read_sql_query(cmd, engine).values]
    except Exception:
        # the connection may have gone stale; reconnect and retry once
        engine = cadb.connect_aws_db(write_unicode=True)
        xstng_bizs = [int(biz_id[0]) for biz_id in pd.read_sql_query(cmd, engine).values]
    return xstng_bizs
def get_biz_review_ids(city, engine):
    """Return the biz_review_ids of all ta_reviews for hotels in the
    given city."""
    cmd = 'select biz_review_id from ta_reviews r inner join '
    cmd += 'ta_hotels h on r.business_id=h.business_id '
    cmd += 'where h.hotel_city = '
    cmd += '"' + ' '.join(city.split('_')) + '"'
    try:
        xstng_revs = [int(rev_id[0]) for rev_id in pd.read_sql_query(cmd, engine).values]
    except Exception:
        # the connection may have gone stale; reconnect and retry once
        engine = cadb.connect_aws_db(write_unicode=True)
        xstng_revs = [int(rev_id[0]) for rev_id in pd.read_sql_query(cmd, engine).values]
    return xstng_revs
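
# get_biz_ids and get_biz_review_ids share the same reconnect-and-retry
# pattern. A small helper like the sketch below (not part of the original
# module) would remove that duplication; it assumes, as the two functions
# above do, that any query failure means a stale connection.
def _query_ids_with_retry(cmd, engine):
    """Run a single-column id query, reconnecting once on failure."""
    try:
        vals = pd.read_sql_query(cmd, engine).values
    except Exception:
        engine = cadb.connect_aws_db(write_unicode=True)
        vals = pd.read_sql_query(cmd, engine).values
    return [int(row[0]) for row in vals]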
def retrieve_best_hotels3(city, state='', revdb='ta', max_revs=10):
    """
    PURPOSE: To retrieve the reviews for all hotels for a given city,
    compute the hotels' ratings, and return the data for the top ten
    hotels for that city.
    """
    # convert the city input to all lower case
    city = str(city).lower()
    engine = cadb.connect_aws_db(write_unicode=True)

    if revdb == 'yelp':
        revdf = get_yelp_reviews(city, engine)
    if revdb == 'ta':
        revdf = get_ta_reviews(city, engine)

    # get the unique hotels
    unique_biz_ids = np.unique(revdf['business_id'])
    unique_hotel_df = revdf[revdf['business_id'].isin(unique_biz_ids)].copy()
    unique_hotel_df.drop_duplicates(subset='business_id', inplace=True)
    unique_hotel_df['num_dog_reviews'] = [len(revdf[revdf['business_id'] == business_id])
                                          for business_id in unique_hotel_df['business_id'].values]

    print('max reviews is: {}'.format(max_revs))

    # weighted score: 80% of the mean dog rating, plus up to one point for
    # review volume (float division so the volume term isn't truncated)
    unique_hotel_df['average_dog_rating'] = [
        np.mean(revdf[revdf['business_id'] == business_id]['dog_rating'].values) * 0.8 +
        min(1, unique_hotel_df[unique_hotel_df['business_id'] == business_id]['num_dog_reviews'].values[0] / float(max_revs))
        for business_id in unique_hotel_df['business_id'].values]

    uniq_df_srtd = unique_hotel_df.sort_values(by='average_dog_rating', ascending=False).head(10).copy()
    best_dog_hotel_names = uniq_df_srtd['hotel_name'].values
    best_dog_hotel_ratings = np.round(uniq_df_srtd['average_dog_rating'].values, 1)
    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]
    best_dog_hotel_imgs = uniq_df_srtd['hotel_img_url'].values
    best_dog_hotel_urls = uniq_df_srtd['hotel_url'].values
    best_dog_hotel_prices = [str(prc) if prc > 0 else "nan"
                             for prc in np.int64(np.round(uniq_df_srtd['hotel_price'].values))]
    return (best_dog_hotel_names, string_ratings, best_dog_hotel_imgs,
            best_dog_hotel_urls, best_dog_hotel_prices)
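
# A quick worked example of the score above (the numbers are made up): a
# hotel with a mean dog rating of 4.5 from 5 dog reviews, with max_revs=10,
# scores 4.5 * 0.8 + min(1, 5/10.) = 3.6 + 0.5 = 4.1. Review volume can add
# at most one point on top of the down-weighted mean rating, so a hotel
# cannot rank highly on volume alone.
def _example_dog_score(mean_rating=4.5, n_reviews=5, max_revs=10):
    return mean_rating * 0.8 + min(1, n_reviews / float(max_revs))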
def classify_review_type(traindb="yelp", ntrain_dog_revs=1500,
                         ntrain_gen_revs=1500, classdb="ta", verbose=False):
    """
    PURPOSE: To classify reviews as being either pet-related or general.

    :param traindb: The data set to use for training. The default is Yelp.
    :param ntrain_dog_revs: The number of dog-related hotel reviews to use
        in the training.
    :param ntrain_gen_revs: The number of general hotel reviews to use in
        the training.
    :param classdb: The data set to classify (either yelp or ta).
    :param verbose: Set this to True to print progress information.
    """
    engine = cadb.connect_aws_db(write_unicode=True)

    if verbose:
        print("grabbing bf data...")
    # get the bringfido reviews
    bfdf = get_bf_reviews(engine)

    if verbose:
        print("grabbing general review data...")
    if traindb == "ta":
        print("using TA data for training...")
        gentraindf = get_ta_reviews(engine)
    if traindb == "yelp":
        print("using Yelp data for training...")
        gentraindf = get_yelp_reviews(engine)

    train_data = np.hstack((bfdf["review_text"].values[:ntrain_dog_revs],
                            gentraindf["review_text"].values[:ntrain_gen_revs]))
    labels = ["dog"] * ntrain_dog_revs
    labels.extend(["general"] * ntrain_gen_revs)
    y_train = labels

    if verbose:
        print("vectorizing...")
    t0 = time.time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words="english")
    X_train = vectorizer.fit_transform(train_data)
    duration = time.time() - t0
    print("vectorized in {:.2f} seconds.".format(duration))

    penalty = "l2"
    # the old loss="l2" alias is deprecated; "squared_hinge" is its new name
    clf = LinearSVC(loss="squared_hinge", penalty=penalty, dual=False, tol=1e-3)

    if verbose:
        print("training model...")
    clf.fit(X_train, y_train)

    if classdb == "yelp":
        print("categorizing Yelp data...")
        classdf = get_yelp_reviews(engine, remove_shorts=False)
    if classdb == "ta":
        print("categorizing TA data...")
        classdf = get_ta_reviews(engine, remove_shorts=False)

    X_yrevs = vectorizer.transform(classdf["review_text"].values)

    if verbose:
        print("predicting...")
    pred = clf.predict(X_yrevs)
    classdf["review_category"] = pred

    update_table_rev_cat(classdf, engine)
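
# A minimal sketch (not in the original module) of how the classifier above
# could be sanity-checked before categorizing a whole table: hold out part of
# the labeled data and report accuracy. It assumes scikit-learn's
# train_test_split is available; the 0.2 test fraction is arbitrary.
def _example_validate_classifier(train_data, y_train):
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    X_tr, X_te, y_tr, y_te = train_test_split(train_data, y_train,
                                              test_size=0.2, random_state=42)
    vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    clf = LinearSVC(loss="squared_hinge", penalty="l2", dual=False, tol=1e-3)
    clf.fit(vec.fit_transform(X_tr), y_tr)
    return accuracy_score(y_te, clf.predict(vec.transform(X_te)))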
def splinter_scrape_ta_hotels(city_url="", city="new_haven", state="ct",
                              write_to_db=False, max_pages=20):
    """PURPOSE: To scrape the hotel listings (names, links, images, and
    prices) for a given city from TripAdvisor and, optionally, write them
    to the ta_hotels table."""
    # this only needs to be done at the very beginning
    br = Browser()

    if city_url == "":
        city_url = get_city(city.lower() + "_" + state.lower())

    print("using the following url:")
    print("{}".format(city_url))
    # city_url = "http://www.tripadvisor.com/Hotels-g31310-Phoenix_Arizona-Hotels.html"

    #####################################################
    # do not edit below this line
    #####################################################

    # more_pages is used to keep track if there is more
    # than one page of hotel results for the given city
    more_pages = True

    # scraping will start on page 1 of the hotel results
    page = 1

    # open the URL in a browser object:
    br.visit(city_url)

    # find the div to enter the date range. This is needed to get pricing info:
    date_bar = br.find_by_xpath('//*[contains(@class, "meta_date_wrapper")]')

    # find the check in calendar span:
    cin_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_in")]/span')[0]
    # now click the check_in span to activate it
    cin_btn.click()

    # select the right calendar div (next month)
    rightcal = br.find_by_xpath('//div[contains(@class, "month")]')[1]
    # now select the third Friday of next month as the check in date
    fri_btn = rightcal.find_by_xpath("table/tbody/tr[3]/td[6]/div")
    # and click it
    fri_btn.click()

    # now choose the next day (Saturday) as the check out date
    cout_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_out")]/span')[0]
    cout_btn.click()
    leftcal = br.find_by_xpath('//div[contains(@class, "month")]')[0]
    sat_btn = leftcal.find_by_xpath("table/tbody/tr[3]/td[7]/div")
    sat_btn.click()
    print("Dates selected.")

    # wait a few seconds for ta to retrieve prices
    time.sleep(5)

    # get the city and state info
    loclist = br.find_by_xpath('//*[contains(@id, "BREADCRUMBS")]')
    locstring = loclist.text.split(u"\u203a")
    hotel_city = city = locstring[2].lower()
    hotel_state = re.findall(r"\w+ \(([A-Z][A-Z])\)", locstring[1])[0].lower()

    # create a pandas dataframe that will be used for writing
    # the results to the DB:
    columns = ["hotel_id", "hotel_url", "hotel_img_url", "hotel_name",
               "hotel_address", "hotel_city", "hotel_state", "hotel_rating",
               "hotel_latitude", "hotel_longitude", "hotel_price",
               "business_id", "review_count", "dog_review_count"]
    bigdf = pd.DataFrame(columns=columns)

    # create some lists to fill with the results from each page
    hotel_names = []
    links = []
    img_url = []
    hotel_price = []
    business_id = []

    print("starting scraper loop.")
    while more_pages and page <= max_pages:
        print("*" * 75)
        print("Now scraping page {} of {} of the hotel results".format(page, max_pages))
        print("*" * 75)

        # get all the review divs
        print("waiting a few seconds before scraping...")
        time.sleep(np.random.uniform(8, 20))

        listing_div = br.find_by_xpath('//*[contains(@class, "hotels_lf_condensed")]')
        xsts1 = br.is_element_present_by_xpath('//*[contains(@class, "photo_booking")]', wait_time=1)
        xsts2 = br.is_element_present_by_xpath('//*[contains(@class, "property_details")]', wait_time=1)
        xsts3 = br.is_element_present_by_xpath('//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]', wait_time=1)

        # keep polling until the listings and their sub-elements have loaded
        while len(listing_div) < 1 or not xsts1 or not xsts2 or not xsts3:
            print("now waiting for the listing divs to load")
            time.sleep(5)
            listing_div = br.find_by_xpath('//*[contains(@class, "hotels_lf_condensed")]')
            xsts1 = br.is_element_present_by_xpath('//*[contains(@class, "photo_booking")]', wait_time=1)
            xsts2 = br.is_element_present_by_xpath('//*[contains(@class, "property_details")]', wait_time=1)
            xsts3 = br.is_element_present_by_xpath('//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]', wait_time=1)
            print("# of listings: {}".format(len(listing_div)))
            print("photo_booking exists: {}".format(xsts1))
            print("property_details exists: {}".format(xsts2))
            print("prw_rup exists: {}".format(xsts3))

        print("Number of hotel listings on this page: {}".format(len(listing_div)))

        df = pd.DataFrame(columns=columns)
        for listing in listing_div:
            try:
                biz_id = re.findall(r"hotel_(\d+)", listing["id"])
                biz_id = biz_id[0] if len(biz_id) > 0 else None
                print("business_id: {}".format(biz_id))
                business_id.append(biz_id)
            except Exception:
                print("!" * 80)
                print("biz_id DOES NOT EXIST!")
                print("!" * 80)
                business_id.append(None)
            try:
                prop = listing.find_by_xpath('div/div/div/div[contains(@class, "property_details")]')
            except Exception:
                print("!" * 80)
                print("prop DIV DOES NOT EXIST!")
                print("!" * 80)
            try:
                title = prop.find_by_xpath('div/div[@class="listing_title"]')
                print(title.text)
                hotel_names.append(title.text)
            except Exception:
                print("!" * 80)
                print("TITLE DIV DOES NOT EXIST!")
                print("!" * 80)
                hotel_names.append(None)
            try:
                hotel_link = title.find_by_xpath("a")["href"]
                print(hotel_link)
                links.append(hotel_link)
            except Exception:
                print("!" * 80)
                print("hotel_link DOES NOT EXIST!")
                print("!" * 80)
                links.append(None)
            try:
                hotel_img = prop.find_by_xpath('div[@class="photo_booking"]/div/div/a/img')["src"]
                print("Hotel img URL: {}".format(hotel_img))
                img_url.append(hotel_img)
            except Exception:
                print("!" * 80)
                print("hotel_img DIV DOES NOT EXIST!")
                print("!" * 80)
                img_url.append(None)
            try:
                price_text = prop.find_by_xpath('div[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]').text
                price = re.findall(r"(\d+)", price_text)[0]
                print("Price: ${}".format(price))
                hotel_price.append(price)
            except Exception:
                print("!" * 80)
                print("price DIV DOES NOT EXIST!")
                print("!" * 80)
                hotel_price.append(None)
            print("*" * 50)

        if len(hotel_names) > 0:
            print("len of hotel_names: {}".format(len(hotel_names)))
            print("len of hotel_price: {}".format(len(hotel_price)))
            print("len of img_url: {}".format(len(img_url)))
            print("len of business_id: {}".format(len(business_id)))
            print("hotel_city: {}".format(hotel_city))
            print("hotel_state: {}".format(hotel_state))
            df["hotel_name"] = hotel_names
            df["hotel_price"] = hotel_price
            df["hotel_img_url"] = img_url
            df["hotel_url"] = links
            df["business_id"] = business_id
            df["hotel_city"] = hotel_city
            df["hotel_state"] = hotel_state
            bigdf = bigdf.append(df)

        # update the page number
        page += 1

        # if more pages are desired, look for a "next" button
        if page <= max_pages:
            nxt_btn = br.find_by_xpath('//div[contains(@class, "deckTools")]/div[contains(@class, "unified")]/a[contains(@class, "next")]')
            # if there is a next button, click it;
            # else exit the while loop
            if len(nxt_btn) > 0:
                nxt_btn.click()
            else:
                more_pages = False

    if write_to_db:
        engine = cadb.connect_aws_db(write_unicode=True)
        bigdf = remove_ad_hotels(bigdf)
        remove_duplicate_hotels(bigdf, city, engine)
        bigdf.to_sql("ta_hotels", engine, if_exists="append", index=False)
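
# A minimal usage sketch for the hotel scraper above (the city/state values
# are illustrative). Passing city_url directly -- the commented-out Phoenix
# URL inside the function shows the expected form -- skips the get_city
# lookup; max_pages bounds how deep into the paginated results it goes.
def _example_scrape_ta_hotels():
    splinter_scrape_ta_hotels(city="phoenix", state="az",
                              write_to_db=False, max_pages=2)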
def retrieve_best_hotels2(city, state='', revdb='ta', max_revs=10):
    """PURPOSE: To retrieve the reviews for all hotels in a given city,
    predict a dog rating for each review from its sentiment, and return
    the data for the top ten hotels for that city."""
    city = str(city).lower()
    engine = cadb.connect_aws_db(write_unicode=True)

    # create and train the model to use to classify ratings:
    clf, vectorizer = create_and_train_model(engine)

    if revdb == 'yelp':
        revdf = get_yelp_reviews(city, engine)
    if revdb == 'ta':
        revdf = get_ta_reviews(city, engine)

    review_text = revdf['review_text'].values
    print('Number of reviews: {}'.format(len(review_text)))

    t0 = time.time()
    X_pred = vectorizer.transform(review_text)
    duration = time.time() - t0
    print('transformed test data in {:.2f} seconds.'.format(duration))

    # now predict the rating based on the sentiment of the review:
    y_pred = clf.predict(X_pred)
    revdf['dog_rating'] = y_pred

    # get the unique hotels
    unique_biz_ids = np.unique(revdf['business_id'])
    unique_hotel_df = revdf[revdf['business_id'].isin(unique_biz_ids)].copy()
    unique_hotel_df.drop_duplicates(subset='business_id', inplace=True)
    unique_hotel_df['num_dog_reviews'] = [len(revdf[revdf['business_id'] == business_id])
                                          for business_id in unique_hotel_df['business_id'].values]

    print('max reviews is: {}'.format(max_revs))

    # weighted score: 80% of the mean predicted rating, plus up to one point
    # for review volume (float division so the volume term isn't truncated)
    unique_hotel_df['average_dog_rating'] = [
        np.mean(revdf[revdf['business_id'] == business_id]['dog_rating'].values) * 0.8 +
        min(1, unique_hotel_df[unique_hotel_df['business_id'] == business_id]['num_dog_reviews'].values[0] / float(max_revs))
        for business_id in unique_hotel_df['business_id'].values]

    print('dog ratings are...')
    print(unique_hotel_df['average_dog_rating'].values)

    uniq_df_srtd = unique_hotel_df.sort_values(by='average_dog_rating', ascending=False).head(10).copy()
    best_dog_hotel_names = uniq_df_srtd['hotel_name'].values
    best_dog_hotel_ratings = np.round(uniq_df_srtd['average_dog_rating'].values, 1)
    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]
    best_dog_hotel_imgs = uniq_df_srtd['hotel_img_url'].values
    best_dog_hotel_urls = uniq_df_srtd['hotel_url'].values
    best_dog_hotel_prices = [str(prc) if prc > 0 else "nan"
                             for prc in np.int64(np.round(uniq_df_srtd['hotel_price'].values))]
    print('best dog hotel prices')
    print(best_dog_hotel_prices)
    return (best_dog_hotel_names, string_ratings, best_dog_hotel_imgs,
            best_dog_hotel_urls, best_dog_hotel_prices)
def splinter_scrape_bf(city, state):
    """PURPOSE: To scrape the hotel listings and reviews for a given city
    from BringFido and write them to the bf_hotels and bf_reviews tables."""
    city = city.lower()
    state = state.lower()

    br = Browser()
    citystate_string = city + '_' + state + '_us/'
    url = 'http://www.bringfido.com/lodging/city/' + citystate_string
    br.visit(url)

    page = 1
    npages = len(br.find_by_xpath('//*[@id="results_paging_controls_bottom"]/span'))

    columns = ['hotel_id', 'hotel_img_url', 'hotel_url', 'hotel_name',
               'hotel_address', 'hotel_city', 'hotel_state', 'hotel_rating',
               'hotel_latitude', 'hotel_longitude', 'review_count',
               'business_id', 'review_id', 'user_id', 'username',
               'review_title', 'review_text', 'review_rating', 'review_date']
    bigdf = pd.DataFrame(columns=columns)

    while page == 1 or page < npages:
        print('*' * 70)
        print('Now on page {}'.format(page))

        archive_links = br.find_by_xpath('//*[@id="results_list"]/div')

        hotel_names = []
        text_summaries = []
        links = []
        biz_ids = []
        hotel_img_urls = []

        df = pd.DataFrame(columns=columns)

        texts = []
        titles = []
        authors = []
        ratings = []
        hnms = []
        hiurls = []
        bids = []
        lnks = []

        for lnk in archive_links:
            hotel_names.append(lnk.find_by_xpath('div[2]/h1/a').value)
            text_summaries.append(lnk.text)
            this_link = lnk.find_by_xpath('div/h1/a')['href']
            links.append(this_link)
            hotel_img_urls.append(lnk.find_by_xpath('div/div[@class="photo_inner"]/a/img')['src'])
            biz_ids.append(lnk['id'].split('_')[-1])

        for hotel_id, link in enumerate(links):
            print('*' * 75)
            print('Now on {}: {}'.format(hotel_id, link))
            print('*' * 75)
            br.visit(link)
            # hotel_description = br.find_by_xpath('//*[@class="body"]').text

            # scrape the address details section of the page
            details = br.find_by_xpath('//*[@class="address"]').text.split('\n')
            # now get just the address:
            address = details[0]
            # and just the city, state, country, and zip code:
            csczip = details[1]
            # and just the phone number
            # phone = details[2]
            # now separate the city, state, and zip:
            city, state, zipcode = csczip.strip().split(',')
            zipcode = zipcode[3:]

            # now, using the correct XPath, fetch the reviews
            reviews = br.find_by_xpath('//*[@class="review_container"]')
            print(reviews)
            print('')
            for rev in reviews:
                titles.append(rev.find_by_xpath('div/div[1]').text)
                authors.append(rev.find_by_xpath('div/div[2]').text)
                texts.append(rev.find_by_xpath('div/div[3]').text)
                ratings.append(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])
                hnms.append(hotel_names[hotel_id])
                hiurls.append(hotel_img_urls[hotel_id])
                bids.append(biz_ids[hotel_id])
                lnks.append(link)
                print(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])

        print('Number of new titles: {}'.format(len(titles)))
        print('Number of new ratings: {}'.format(len(ratings)))

        df['review_title'] = titles
        df['username'] = authors
        df['review_text'] = texts
        df['review_rating'] = ratings
        df['hotel_id'] = hotel_id
        df['hotel_name'] = hnms
        df['hotel_url'] = lnks
        df['hotel_img_url'] = hiurls
        # NOTE: address/city/state are single values per page, so they come
        # from the last hotel visited above
        df['hotel_address'] = address
        df['hotel_city'] = city
        df['hotel_state'] = state
        df['hotel_rating'] = np.mean([int(rat) for rat in ratings])
        df['hotel_latitude'] = None
        df['hotel_longitude'] = None
        df['review_count'] = len(texts)
        df['review_id'] = 0
        df['user_id'] = 0
        df['business_id'] = bids

        print('new entries from this page: {}'.format(len(df)))
        bigdf = bigdf.append(df.copy())

        page += 1
        if page < npages:
            page_timeout = True
            while page_timeout:
                br.visit(url)
                time.sleep(1)
                print('Now scraping page {} of {}'.format(page, npages))
                button = br.find_by_id('page_' + str(page))
                print(button)
                if len(button) > 0:
                    button.click()
                    page_timeout = False

    bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                           'username', 'review_title', 'review_text',
                           'review_rating']].copy()
    bigdf_hotels = bigdf[['hotel_id', 'hotel_url', 'hotel_img_url',
                          'hotel_name', 'hotel_address', 'hotel_city',
                          'hotel_state', 'hotel_rating', 'hotel_latitude',
                          'hotel_longitude', 'business_id',
                          'review_count']].copy()
    bigdf_hotels.drop_duplicates(subset='business_id', inplace=True)
    bigdf_hotels['hotel_id'] = None
    bigdf_reviews['review_id'] = None

    print('Number of bf reviews to add: {}'.format(len(bigdf_reviews)))

    engine = cadb.connect_aws_db(write_unicode=True)
    bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)
    bigdf_hotels.to_sql('bf_hotels', engine, if_exists='append', index=False)
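
# An end-to-end usage sketch (the city/state values are illustrative): scrape
# BringFido for the labeled dog reviews, classify the TripAdvisor reviews
# against them, then rank the city's hotels. Each step writes to the DB, so
# the steps can also be run independently.
def _example_bf_pipeline():
    splinter_scrape_bf('austin', 'tx')
    classify_review_type(traindb='yelp', classdb='ta', verbose=True)
    names, ratings, imgs, urls, prices = retrieve_best_hotels3('austin', revdb='ta')
    print(names, ratings)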