def restrict_review_langs(hosts, listings, reviews, guests):
    """Keep English-language reviews and trim hosts, listings and guests to match.

    A review is kept when either

    * ``google_langs`` equals ``langdetect_langs``, that language is ``'en'``
      and both confidence columns are above 0.9, or
    * ``google_langs`` is ``'unk'`` while ``langdetect_langs`` is ``'en'``
      with ``langdetect_langs_conf`` of at least 0.99.

    Progress is reported through ``print``. The percentages are whatever
    ``utils.get_decreased_percent`` returns (presumably the relative drop
    from its second argument to its first -- confirm against ``utils``).

    Args:
        hosts: table with an ``id`` column.
        listings: table with an ``id`` column.
        reviews: table with the language columns named above plus
            ``recipient_id``, ``listing_id`` and ``reviewer_id``.
        guests: table with an ``id`` column.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)``. ``new_overall_guests`` is the result of
        ``unique()`` on the kept reviews' ``reviewer_id``.
    """
    # Reduced Reviews langs
    print('Removing reviews based on Language Restrictions')

    google_lang = reviews['google_langs']
    detect_lang = reviews['langdetect_langs']

    # Both detectors agree on English with confidence above 0.9.
    agreement_mask = (
        (google_lang == detect_lang)
        & (google_lang == 'en')
        & (reviews['google_langs_conf'] > 0.9)
        & (reviews['langdetect_langs_conf'] > 0.9)
    )
    agreed_english = reviews[agreement_mask]
    print(
        "-Reviews that have 'English' language detection agreements between 'langdetect' and googletrans = %d"
        % len(agreed_english)
    )

    # googletrans reported 'unk' but langdetect is near-certain it is English.
    unknown_mask = (
        (google_lang == 'unk')
        & (detect_lang == 'en')
        & (reviews['langdetect_langs_conf'] >= 0.99)
    )
    langdetect_only = reviews[unknown_mask]
    print(
        "-Reviews that contained emoticons in them preventing 'googletrans' from correct detection = %d"
        % len(langdetect_only)
    )

    combined = pd.concat([agreed_english, langdetect_only])
    new_reviews = combined.drop_duplicates()
    review_drop = utils.get_decreased_percent(new_reviews, reviews)
    print("-Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), review_drop))

    # Reduced Hosts
    kept_host_ids = set(new_reviews['recipient_id'].astype(str))
    new_hosts = hosts[hosts['id'].isin(kept_host_ids)]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    # Reduced Listings
    kept_listing_ids = set(new_reviews['listing_id'].astype(str))
    new_listings = listings[listings['id'].isin(kept_listing_ids)]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    # Reduced Overall Guests
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    overall_drop = utils.get_decreased_percent(new_overall_guests, overall_guests)
    print(
        "Revised number of Overall Guests: %d (decreased %.2f %%)"
        % (len(new_overall_guests), overall_drop)
    )

    # Reduced Retrieved Guests
    guest_id_strings = new_overall_guests.astype(str)
    new_retrieved_guests = guests[guests['id'].isin(guest_id_strings)]
    retrieved_drop = utils.get_decreased_percent(new_retrieved_guests, guests)
    print(
        "Revised number of Retrieved Guests: %d (decreased %.2f %%)"
        % (len(new_retrieved_guests), retrieved_drop)
    )

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
def restrict_by_people_pic(hpic_tbl, gpic_tbl, hosts, listings, reviews, guests):
    """Keep reviews whose host and guest pictures each show exactly one person.

    Host ids are taken from ``hpic_tbl`` rows where ``num_of_people_in_pic``
    equals 1, guest ids from ``gpic_tbl`` rows where ``num_of_people``
    equals 1. Both id collections go through ``utils.convert_to_str``
    (presumably a string conversion -- confirm against ``utils``) before the
    reviews are filtered. Hosts, listings and guests are then narrowed to
    the ids that still occur in the kept reviews. Progress is reported
    through ``print``.

    Args:
        hpic_tbl: host picture table with ``id`` and ``num_of_people_in_pic``.
        gpic_tbl: guest picture table with ``id`` and ``num_of_people``.
        hosts: table with an ``id`` column.
        listings: table with an ``id`` column.
        reviews: table with ``recipient_id``, ``reviewer_id`` and
            ``listing_id`` columns.
        guests: table with an ``id`` column.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)``.
    """
    # Reduced Hosts
    one_person_host_rows = hpic_tbl[hpic_tbl['num_of_people_in_pic'] == 1]
    new_host_ids = set(utils.convert_to_str(one_person_host_rows['id'].unique()))

    # Reduced Guests
    one_person_guest_rows = gpic_tbl[gpic_tbl['num_of_people'] == 1]
    new_guest_ids = set(utils.convert_to_str(one_person_guest_rows['id'].unique()))

    # Reduced Reviews
    host_matches = reviews['recipient_id'].isin(new_host_ids)
    guest_matches = reviews['reviewer_id'].isin(new_guest_ids)
    new_reviews = reviews[host_matches & guest_matches]
    review_drop = utils.get_decreased_percent(new_reviews, reviews)
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), review_drop))

    # Reduced Hosts
    reviewed_host_ids = set(new_reviews['recipient_id'])
    new_hosts = hosts[hosts['id'].isin(reviewed_host_ids)]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    # Reduced Listings
    reviewed_listing_ids = set(new_reviews['listing_id'])
    new_listings = listings[listings['id'].isin(reviewed_listing_ids)]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    # Reduced Overall Guests
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    overall_drop = utils.get_decreased_percent(new_overall_guests, overall_guests)
    print(
        "Revised number of Overall Guests: %d (decreased %.2f %%)"
        % (len(new_overall_guests), overall_drop)
    )

    # Reduced Retrieved Guests
    remaining_guest_ids = set(utils.convert_to_str(new_overall_guests))
    new_retrieved_guests = guests[guests['id'].isin(remaining_guest_ids)]
    retrieved_drop = utils.get_decreased_percent(new_retrieved_guests, guests)
    print(
        "Revised number of Retrieved Guests: %d (decreased %.2f %%)"
        % (len(new_retrieved_guests), retrieved_drop)
    )

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
def restrict_review_length(hosts, listings, reviews, guests, min_tokens=5):
    """Drop empty and too-short reviews, then trim the related tables.

    Two passes are applied to ``reviews``:

    1. rows whose ``comments`` is null or one of the strings ``'None'``,
       ``''`` or ``'nan'`` are removed;
    2. of the remaining rows, only those with ``token_len >= min_tokens``
       are kept.

    Hosts, listings and guests are then narrowed to the ids that still occur
    in the kept reviews. Progress is reported through ``print``. The
    percentages are whatever ``utils.get_decreased_percent`` returns
    (presumably the relative drop from its second argument to its first --
    confirm against ``utils``).

    Args:
        hosts: table with an ``id`` column.
        listings: table with an ``id`` column.
        reviews: table with ``comments``, ``token_len``, ``recipient_id``,
            ``listing_id`` and ``reviewer_id`` columns.
        guests: table with an ``id`` column.
        min_tokens: minimum ``token_len`` a review needs to be kept.
            Defaults to 5, the previously hard-coded value.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)``.
    """
    # Reduced Reviews 1
    # Double-quoted so the inner '' survives; with single quotes the literal
    # was split in two and the message printed "empty(, None, nan)".
    print("Removing empty('', None, nan) reviews")
    comments = reviews['comments']
    is_placeholder = comments.isin(['None', '', 'nan'])
    nempty_reviews = reviews[~is_placeholder & ~comments.isnull()]
    print("Revised number of Reviews: %d (decreased %.2f %%)"
          % (len(nempty_reviews), utils.get_decreased_percent(nempty_reviews, reviews)))

    # Reduced Reviews 2
    print('Removing reviews less than %d words' % min_tokens)
    new_reviews = nempty_reviews[nempty_reviews['token_len'] >= min_tokens]
    # This percentage is relative to the non-empty reviews, not the input.
    print("Revised number of Reviews: %d (decreased %.2f %%)"
          % (len(new_reviews), utils.get_decreased_percent(new_reviews, nempty_reviews)))

    # Reduced Hosts
    kept_host_ids = set(new_reviews['recipient_id'].astype(str))
    new_hosts = hosts[hosts['id'].isin(kept_host_ids)]
    print("Revised number of Hosts: %d (decreased %.2f %%)"
          % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Reduced Listings
    kept_listing_ids = set(new_reviews['listing_id'].astype(str))
    new_listings = listings[listings['id'].isin(kept_listing_ids)]
    print("Revised number of Listings: %d (decreased %.2f %%)"
          % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Overall Guests
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)"
          % (len(new_overall_guests),
             utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests
    new_retrieved_guests = guests[guests['id'].isin(new_overall_guests.astype(str))]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)"
          % (len(new_retrieved_guests),
             utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
def remove_cancellations(hosts, listings, reviews, guests):
    """Drop host-cancellation notices from reviews and trim the related tables.

    Only reviews whose ``hostCancelled`` column equals ``'N'`` are kept.
    Hosts, listings and guests are then narrowed to the ids that still occur
    in the kept reviews. Progress is reported through ``print``. The
    percentages are whatever ``utils.get_decreased_percent`` returns
    (presumably the relative drop from its second argument to its first --
    confirm against ``utils``).

    Args:
        hosts: table with an ``id`` column.
        listings: table with an ``id`` column.
        reviews: table with ``hostCancelled``, ``recipient_id``,
            ``listing_id`` and ``reviewer_id`` columns.
        guests: table with an ``id`` column.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)``.
    """
    # Remove Cancellations
    print('Removing cancellation notifications from reviews')
    new_reviews = reviews[reviews['hostCancelled'] == 'N']
    print("Revised number of Reviews: %d (decreased %.2f %%)"
          % (len(new_reviews), utils.get_decreased_percent(new_reviews, reviews)))

    # Reduced Hosts
    kept_host_ids = set(new_reviews['recipient_id'].astype(str))
    new_hosts = hosts[hosts['id'].isin(kept_host_ids)]
    print("Revised number of Hosts: %d (decreased %.2f %%)"
          % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Reduced Listings
    kept_listing_ids = set(new_reviews['listing_id'].astype(str))
    new_listings = listings[listings['id'].isin(kept_listing_ids)]
    print("Revised number of Listings: %d (decreased %.2f %%)"
          % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Overall Guests
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)"
          % (len(new_overall_guests),
             utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests
    # Compare ids as strings on both sides, like the host/listing filters
    # above: a raw isin() silently matches nothing when the reviewer_id and
    # guests['id'] dtypes differ (e.g. int vs. str).
    kept_guest_ids = set(new_overall_guests.astype(str))
    new_retrieved_guests = guests[guests['id'].astype(str).isin(kept_guest_ids)]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)"
          % (len(new_retrieved_guests),
             utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
def restrict_number_of_reviews(hosts, listings, reviews, guests, min_reviews=5):
    """Keep only hosts with at least ``min_reviews`` reviews, plus their data.

    Reviews are counted per ``recipient_id`` (non-null ``id`` values). Hosts
    reaching the threshold are kept, and listings, reviews and guests are
    narrowed to those hosts. Progress is reported through ``print``. The
    percentages are whatever ``utils.get_decreased_percent`` returns
    (presumably the relative drop from its second argument to its first --
    confirm against ``utils``).

    Args:
        hosts: table with an ``id`` column.
        listings: table with a ``host_id`` column.
        reviews: table with ``id``, ``recipient_id`` and ``reviewer_id``
            columns.
        guests: table with an ``id`` column.
        min_reviews: minimum number of reviews a host needs to be kept.
            Defaults to 5, equivalent to the previously hard-coded ``> 4``.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)``.
    """
    # Reduced Hosts
    reviews_per_host = reviews.groupby('recipient_id')['id'].count()
    qualifying = reviews_per_host[reviews_per_host >= min_reviews]
    # Ids are stringified before matching against the other tables.
    # NOTE(review): the reviews filter below therefore assumes
    # reviews['recipient_id'] already holds strings -- confirm.
    new_host_ids = {str(host_id) for host_id in qualifying.index}

    # Reduced Hosts
    new_hosts = hosts[hosts['id'].isin(new_host_ids)]
    print("Revised number of Hosts: %d (decreased %.2f %%)"
          % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Reduced Listings
    new_listings = listings[listings['host_id'].isin(new_host_ids)]
    print("Revised number of Listings: %d (decreased %.2f %%)"
          % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Reviews
    new_reviews = reviews[reviews['recipient_id'].isin(new_host_ids)]
    print("Revised number of Reviews: %d (decreased %.2f %%)"
          % (len(new_reviews), utils.get_decreased_percent(new_reviews, reviews)))

    # Reduced Overall Guests
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)"
          % (len(new_overall_guests),
             utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests
    kept_guest_ids = set(utils.convert_to_str(new_overall_guests))
    new_retrieved_guests = guests[guests['id'].isin(kept_guest_ids)]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)"
          % (len(new_retrieved_guests),
             utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
def remove_no_reviews(hosts, listings, reviews):
    """Drop listings and hosts that do not appear in any review.

    A listing is kept when its ``id`` is among the stringified unique
    ``listing_id`` values of ``reviews``; a host is kept when its ``id`` is
    among the stringified unique ``recipient_id`` values. Progress is
    reported through ``print``. The percentages are whatever
    ``utils.get_decreased_percent`` returns (presumably the relative drop
    from its second argument to its first -- confirm against ``utils``).

    Args:
        hosts: table with an ``id`` column.
        listings: table with an ``id`` column.
        reviews: table with ``listing_id`` and ``recipient_id`` columns.

    Returns:
        Tuple ``(new_hosts, new_listings)``.
    """
    # Remove Listings with no reviews
    print('Remove Listings with no reviews')
    reviewed_listing_ids = reviews['listing_id'].unique().astype(str)
    has_listing_review = listings['id'].isin(reviewed_listing_ids)
    new_listings = listings[has_listing_review]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    # Remove Hosts with no reviews
    print('Remove Hosts with no reviews')
    reviewed_host_ids = reviews['recipient_id'].unique().astype(str)
    has_host_review = hosts['id'].isin(reviewed_host_ids)
    new_hosts = hosts[has_host_review]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    return (new_hosts, new_listings)
def restrict_by_received_guests(hosts, listings, reviews, guests):
    """Keep reviews whose reviewer and recipient both exist in the given tables.

    A review is kept when its ``reviewer_id`` occurs in ``guests['id']`` and
    its ``recipient_id`` occurs in ``hosts['id']``. Hosts, guests and
    listings are then narrowed to the ids (compared as strings) that still
    occur in the kept reviews. Progress is reported through ``print``. The
    percentages are whatever ``utils.get_decreased_percent`` returns
    (presumably the relative drop from its second argument to its first --
    confirm against ``utils``).

    Args:
        hosts: table with an ``id`` column.
        listings: table with an ``id`` column.
        reviews: table with ``reviewer_id``, ``recipient_id`` and
            ``listing_id`` columns.
        guests: table with an ``id`` column.

    Returns:
        Tuple ``(new_hosts, new_guests, new_listings, new_reviews)`` -- note
        the order differs from the five-element tuples returned by the other
        restriction functions.
    """
    # Reduced Reviews
    print('Restrict to only reviews from guests whose profile we have')
    known_guest = reviews['reviewer_id'].isin(guests['id'].unique())
    known_host = reviews['recipient_id'].isin(hosts['id'].unique())
    new_reviews = reviews[known_guest & known_host]
    review_drop = utils.get_decreased_percent(new_reviews, reviews)
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), review_drop))

    # Reduced Hosts
    kept_host_ids = set(new_reviews['recipient_id'].astype(str))
    new_hosts = hosts[hosts['id'].isin(kept_host_ids)]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    # Reduced Guests
    kept_guest_ids = set(new_reviews['reviewer_id'].astype(str))
    new_guests = guests[guests['id'].isin(kept_guest_ids)]
    guest_drop = utils.get_decreased_percent(new_guests, guests)
    print("Revised number of Guests: %d (decreased %.2f %%)" % (len(new_guests), guest_drop))

    # Reduced Listings
    kept_listing_ids = set(new_reviews['listing_id'].astype(str))
    new_listings = listings[listings['id'].isin(kept_listing_ids)]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    return (new_hosts, new_guests, new_listings, new_reviews)
def restrict_multiple_listings(hosts, listings, reviews, guests):
    """Keep only single-listing hosts and trim listings, reviews and guests.

    A host is kept when both ``calculated_listings_count`` and
    ``listings_count`` equal 1. Listings and reviews are narrowed to those
    hosts, and guests to the reviewers of the kept reviews. Progress is
    reported through ``print``. The percentages are whatever
    ``utils.get_decreased_percent`` returns (presumably the relative drop
    from its second argument to its first -- confirm against ``utils``).

    Args:
        hosts: table with ``id``, ``calculated_listings_count`` and
            ``listings_count`` columns.
        listings: table with a ``host_id`` column.
        reviews: table with ``recipient_id`` and ``reviewer_id`` columns.
        guests: table with an ``id`` column.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)``.
    """
    # Reduced Hosts
    single_listing = (hosts['calculated_listings_count'] == 1) & (hosts['listings_count'] == 1)
    new_hosts = hosts[single_listing]
    print("Revised number of Hosts: %d (decreased %.2f %%)"
          % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Built once; both the listings and the reviews filter use it.
    kept_host_ids = set(new_hosts['id'].astype(str))

    # Reduced Listings
    new_listings = listings[listings['host_id'].isin(kept_host_ids)]
    print("Revised number of Listings: %d (decreased %.2f %%)"
          % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Reviews
    new_reviews = reviews[reviews['recipient_id'].isin(kept_host_ids)]
    print("Revised number of Reviews: %d (decreased %.2f %%)"
          % (len(new_reviews), utils.get_decreased_percent(new_reviews, reviews)))

    # Reduced Overall Guests
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)"
          % (len(new_overall_guests),
             utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests
    # Compare ids as strings on both sides, like the host id matching above:
    # a raw isin() silently matches nothing when the reviewer_id and
    # guests['id'] dtypes differ (e.g. int vs. str).
    kept_guest_ids = set(new_overall_guests.astype(str))
    new_retrieved_guests = guests[guests['id'].astype(str).isin(kept_guest_ids)]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)"
          % (len(new_retrieved_guests),
             utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)