예제 #1
0
def restrict_review_langs(hosts, listings, reviews, guests):
    """Keep reviews detected as English and shrink the related tables to match.

    A review is kept when either
      * ``google_langs`` and ``langdetect_langs`` are both ``'en'`` and both
        confidence columns are above 0.9, or
      * ``google_langs`` is ``'unk'`` while ``langdetect_langs`` is ``'en'``
        with ``langdetect_langs_conf`` of at least 0.99.

    Progress and the size reduction of every table are printed along the way.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)`` where ``new_overall_guests`` holds the unique
        ``reviewer_id`` values of the kept reviews and the remaining items
        are the filtered tables.
    """
    print('Removing reviews based on Language Restrictions')

    google_lang = reviews['google_langs']
    detect_lang = reviews['langdetect_langs']
    google_conf = reviews['google_langs_conf']
    detect_conf = reviews['langdetect_langs_conf']

    # Both detectors agree on English and are confident about it.
    agreement_mask = (
        (google_lang == detect_lang)
        & (google_lang == 'en')
        & (google_conf > 0.9)
        & (detect_conf > 0.9)
    )
    agreed = reviews[agreement_mask]
    print("-Reviews that have 'English' language detection agreements between 'langdetect' and googletrans = %d" % len(agreed))

    # googletrans reported 'unk' but langdetect is highly confident in English.
    fallback_mask = (google_lang == 'unk') & (detect_lang == 'en') & (detect_conf >= 0.99)
    fallback = reviews[fallback_mask]
    print("-Reviews that contained emoticons in them preventing 'googletrans' from correct detection = %d" % len(fallback))

    combined = pd.concat([agreed, fallback])
    new_reviews = combined.drop_duplicates()
    review_drop = utils.get_decreased_percent(new_reviews, reviews)
    print("-Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), review_drop))

    # Hosts that still receive at least one kept review.
    kept_host_ids = set(new_reviews['recipient_id'].astype(str))
    new_hosts = hosts[hosts['id'].isin(kept_host_ids)]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    # Listings that still carry at least one kept review.
    kept_listing_ids = set(new_reviews['listing_id'].astype(str))
    new_listings = listings[listings['id'].isin(kept_listing_ids)]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    # Unique reviewers before and after the restriction.
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    overall_drop = utils.get_decreased_percent(new_overall_guests, overall_guests)
    print("Revised number of Overall Guests: %d (decreased %.2f %%)" % (len(new_overall_guests), overall_drop))

    # Guest profile rows whose id belongs to a remaining reviewer.
    guest_mask = guests['id'].isin(new_overall_guests.astype(str))
    new_retrieved_guests = guests[guest_mask]
    guest_drop = utils.get_decreased_percent(new_retrieved_guests, guests)
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)" % (len(new_retrieved_guests), guest_drop))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
예제 #2
0
def restrict_by_people_pic(hpic_tbl, gpic_tbl, hosts, listings, reviews, guests):
    """Keep reviews whose host and guest pictures each report exactly one person.

    Host ids come from ``hpic_tbl`` rows with ``num_of_people_in_pic == 1``;
    guest ids come from ``gpic_tbl`` rows with ``num_of_people == 1``. Both id
    collections are passed through ``utils.convert_to_str`` before matching.
    The hosts, listings and guests tables are then narrowed to the rows that
    the surviving reviews refer to, and each reduction is printed.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)`` where ``new_overall_guests`` holds the unique
        ``reviewer_id`` values of the kept reviews.
    """
    # Ids of hosts whose picture row reports a single person.
    single_person_hosts = hpic_tbl[hpic_tbl['num_of_people_in_pic'] == 1]
    new_host_ids = set(utils.convert_to_str(single_person_hosts['id'].unique()))

    # Ids of guests whose picture row reports a single person.
    single_person_guests = gpic_tbl[gpic_tbl['num_of_people'] == 1]
    new_guest_ids = set(utils.convert_to_str(single_person_guests['id'].unique()))

    # A review survives only when both of its parties qualify.
    host_ok = reviews['recipient_id'].isin(new_host_ids)
    guest_ok = reviews['reviewer_id'].isin(new_guest_ids)
    new_reviews = reviews[host_ok & guest_ok]
    review_drop = utils.get_decreased_percent(new_reviews, reviews)
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), review_drop))

    # Hosts referenced by the surviving reviews.
    reviewed_host_ids = set(new_reviews['recipient_id'])
    new_hosts = hosts[hosts['id'].isin(reviewed_host_ids)]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    # Listings referenced by the surviving reviews.
    reviewed_listing_ids = set(new_reviews['listing_id'])
    new_listings = listings[listings['id'].isin(reviewed_listing_ids)]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    # Unique reviewers before and after the restriction.
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    overall_drop = utils.get_decreased_percent(new_overall_guests, overall_guests)
    print("Revised number of Overall Guests: %d (decreased %.2f %%)" % (len(new_overall_guests), overall_drop))

    # Guest profile rows whose id belongs to a remaining reviewer.
    remaining_guest_ids = set(utils.convert_to_str(new_overall_guests))
    new_retrieved_guests = guests[guests['id'].isin(remaining_guest_ids)]
    guest_drop = utils.get_decreased_percent(new_retrieved_guests, guests)
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)" % (len(new_retrieved_guests), guest_drop))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
예제 #3
0
def restrict_review_length(hosts, listings, reviews, guests):
    """Drop empty and very short reviews, then shrink the related tables to match.

    Filtering happens in two steps, each reported on stdout:
      1. reviews whose ``comments`` value is null or one of the strings
         ``'None'``, ``''`` or ``'nan'`` are removed;
      2. of the remainder, only reviews with ``token_len >= 5`` are kept.

    Hosts, listings and guests are then narrowed to the rows referenced by the
    surviving reviews.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)`` where ``new_overall_guests`` holds the unique
        ``reviewer_id`` values of the kept reviews.
    """
    # Reduced Reviews 1
    # The message is double-quoted so the '' inside it is printed literally;
    # with single quotes it was parsed as two adjacent string literals and
    # the empty-string marker vanished from the output.
    print("Removing empty('', None, nan) reviews")
    comments = reviews['comments']
    nempty_reviews = reviews[(comments != 'None') & (comments != '') & (comments != 'nan') & (~comments.isnull())]
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(nempty_reviews), utils.get_decreased_percent(nempty_reviews, reviews)))

    # Reduced Reviews 2
    print('Removing reviews less than 5 words')
    new_reviews = nempty_reviews[nempty_reviews['token_len'] >= 5]
    # This percentage is relative to the non-empty reviews, not the original input.
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), utils.get_decreased_percent(new_reviews, nempty_reviews)))

    # Reduced Hosts
    new_hosts = hosts[hosts['id'].isin(set(new_reviews['recipient_id'].astype(str)))]
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Reduced Listings
    new_listings = listings[listings['id'].isin(set(new_reviews['listing_id'].astype(str)))]
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Overall Guests (unique reviewers before and after the restriction)
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)" % (len(new_overall_guests), utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests (profile rows whose id is a remaining reviewer)
    new_retrieved_guests = guests[guests['id'].isin(new_overall_guests.astype(str))]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)" % (len(new_retrieved_guests), utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
예제 #4
0
def remove_cancellations(hosts, listings, reviews, guests):
    """Drop host-cancellation entries from ``reviews`` and shrink the related tables.

    Only reviews whose ``hostCancelled`` column equals ``'N'`` are kept. Hosts,
    listings and guests are then narrowed to the rows referenced by the kept
    reviews, and every reduction is printed.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)`` where ``new_overall_guests`` holds the unique
        ``reviewer_id`` values of the kept reviews.
    """
    # Remove Cancellations
    print('Removing cancellation notifications from reviews')
    new_reviews = reviews[reviews['hostCancelled'] == 'N']
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), utils.get_decreased_percent(new_reviews, reviews)))

    # Reduced Hosts
    new_hosts = hosts[hosts['id'].isin(set(new_reviews['recipient_id'].astype(str)))]
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Reduced Listings
    new_listings = listings[listings['id'].isin(set(new_reviews['listing_id'].astype(str)))]
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Overall Guests (unique reviewers before and after the restriction)
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)" % (len(new_overall_guests), utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests
    # Reviewer ids are cast to str before matching, just like the host and
    # listing ids above; without the cast, numeric reviewer ids never match
    # string guest ids and every guest is dropped.
    # NOTE(review): assumes guests['id'] holds string ids like hosts['id'] - confirm.
    new_retrieved_guests = guests[guests['id'].isin(new_overall_guests.astype(str))]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)" % (len(new_retrieved_guests), utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
예제 #5
0
def restrict_number_of_reviews(hosts, listings, reviews, guests, min_reviews=5):
    """Keep only hosts with at least ``min_reviews`` reviews, plus their related rows.

    Reviews are counted per ``recipient_id`` using the non-null values of the
    reviews' ``id`` column. Hosts, listings (matched on ``host_id``) and
    reviews (matched on ``recipient_id``) belonging to hosts below the
    threshold are removed, and each reduction is printed.

    Args:
        hosts: Host table with an ``id`` column.
        listings: Listing table with a ``host_id`` column.
        reviews: Review table with ``id``, ``recipient_id`` and
            ``reviewer_id`` columns.
        guests: Guest table with an ``id`` column.
        min_reviews: Minimum number of reviews a host must have to be kept.
            The default of 5 reproduces the previous fixed ``> 4`` rule.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)`` where ``new_overall_guests`` holds the unique
        ``reviewer_id`` values of the kept reviews.
    """
    # Number of reviews received by each host.
    review_counts = reviews.groupby('recipient_id')['id'].count()
    qualifying = review_counts[review_counts >= min_reviews]
    # Ids are compared as strings throughout, so normalise them once here.
    new_host_ids = {str(host_id) for host_id in qualifying.index}

    # Reduced Hosts
    new_hosts = hosts[hosts['id'].isin(new_host_ids)]
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Reduced Listings
    new_listings = listings[listings['host_id'].isin(new_host_ids)]
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Reviews
    # new_host_ids contains strings, so the review column is cast as well;
    # otherwise numeric recipient ids would never match and every review
    # would be dropped.
    new_reviews = reviews[reviews['recipient_id'].astype(str).isin(new_host_ids)]
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), utils.get_decreased_percent(new_reviews, reviews)))

    # Reduced Overall Guests (unique reviewers before and after the restriction)
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)" % (len(new_overall_guests), utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests (profile rows whose id is a remaining reviewer)
    new_retrieved_guests = guests[guests['id'].isin(set(utils.convert_to_str(new_overall_guests)))]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)" % (len(new_retrieved_guests), utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)
예제 #6
0
def remove_no_reviews(hosts, listings, reviews):
    """Drop listings and hosts that are not referenced by any review.

    A listing is kept when its ``id`` appears (as a string) among the
    reviews' ``listing_id`` values; a host is kept when its ``id`` appears
    (as a string) among the reviews' ``recipient_id`` values. Both reductions
    are printed.

    Returns:
        Tuple ``(new_hosts, new_listings)`` with the filtered tables.
    """
    # Listings first.
    print('Remove Listings with no reviews')
    reviewed_listing_ids = reviews['listing_id'].unique()
    listing_mask = listings['id'].isin(reviewed_listing_ids.astype(str))
    new_listings = listings[listing_mask]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    # Then hosts.
    print('Remove Hosts with no reviews')
    reviewed_host_ids = reviews['recipient_id'].unique()
    host_mask = hosts['id'].isin(reviewed_host_ids.astype(str))
    new_hosts = hosts[host_mask]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    return (new_hosts, new_listings)
예제 #7
0
def restrict_by_received_guests(hosts, listings, reviews, guests):
    """Keep reviews whose reviewer and recipient both exist in the given tables.

    A review survives when its ``reviewer_id`` is among ``guests['id']`` and
    its ``recipient_id`` is among ``hosts['id']``. Hosts, guests and listings
    are then narrowed to the rows referenced by the surviving reviews, and
    each reduction is printed.

    Returns:
        Tuple ``(new_hosts, new_guests, new_listings, new_reviews)``. Note
        that the order differs from the other restriction helpers.
    """
    print('Restrict to only reviews from guests whose profile we have')
    reviewer_known = reviews['reviewer_id'].isin(guests['id'].unique())
    recipient_known = reviews['recipient_id'].isin(hosts['id'].unique())
    new_reviews = reviews[reviewer_known & recipient_known]
    review_drop = utils.get_decreased_percent(new_reviews, reviews)
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), review_drop))

    # Hosts that still receive at least one kept review.
    kept_host_ids = set(new_reviews['recipient_id'].astype(str))
    new_hosts = hosts[hosts['id'].isin(kept_host_ids)]
    host_drop = utils.get_decreased_percent(new_hosts, hosts)
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), host_drop))

    # Guests that still author at least one kept review.
    kept_guest_ids = set(new_reviews['reviewer_id'].astype(str))
    new_guests = guests[guests['id'].isin(kept_guest_ids)]
    guest_drop = utils.get_decreased_percent(new_guests, guests)
    print("Revised number of Guests: %d (decreased %.2f %%)" % (len(new_guests), guest_drop))

    # Listings that still carry at least one kept review.
    kept_listing_ids = set(new_reviews['listing_id'].astype(str))
    new_listings = listings[listings['id'].isin(kept_listing_ids)]
    listing_drop = utils.get_decreased_percent(new_listings, listings)
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), listing_drop))

    return (new_hosts, new_guests, new_listings, new_reviews)
예제 #8
0
def restrict_multiple_listings(hosts, listings, reviews, guests):
    """Keep only hosts with exactly one listing, plus their related rows.

    A host is kept when both ``calculated_listings_count`` and
    ``listings_count`` equal 1. Listings (matched on ``host_id``) and reviews
    (matched on ``recipient_id``) are narrowed to those hosts, guests are
    narrowed to the remaining reviewers, and each reduction is printed.

    Returns:
        Tuple ``(new_hosts, new_listings, new_reviews, new_overall_guests,
        new_retrieved_guests)`` where ``new_overall_guests`` holds the unique
        ``reviewer_id`` values of the kept reviews.
    """
    # Reduced Hosts
    new_hosts = hosts[(hosts['calculated_listings_count'] == 1) & (hosts['listings_count'] == 1)]
    print("Revised number of Hosts: %d (decreased %.2f %%)" % (len(new_hosts), utils.get_decreased_percent(new_hosts, hosts)))

    # Build the string id set once; it is used for both listings and reviews.
    single_listing_host_ids = set(new_hosts['id'].astype(str))

    # Reduced Listings
    new_listings = listings[listings['host_id'].isin(single_listing_host_ids)]
    print("Revised number of Listings: %d (decreased %.2f %%)" % (len(new_listings), utils.get_decreased_percent(new_listings, listings)))

    # Reduced Reviews
    new_reviews = reviews[reviews['recipient_id'].isin(single_listing_host_ids)]
    print("Revised number of Reviews: %d (decreased %.2f %%)" % (len(new_reviews), utils.get_decreased_percent(new_reviews, reviews)))

    # Reduced Overall Guests (unique reviewers before and after the restriction)
    overall_guests = reviews['reviewer_id'].unique()
    new_overall_guests = new_reviews['reviewer_id'].unique()
    print("Revised number of Overall Guests: %d (decreased %.2f %%)" % (len(new_overall_guests), utils.get_decreased_percent(new_overall_guests, overall_guests)))

    # Reduced Retrieved Guests
    # Reviewer ids are cast to str before matching, in line with how host ids
    # are handled above; without the cast, numeric reviewer ids never match
    # string guest ids and every guest is dropped.
    # NOTE(review): assumes guests['id'] holds string ids like hosts['id'] - confirm.
    new_retrieved_guests = guests[guests['id'].isin(new_overall_guests.astype(str))]
    print("Revised number of Retrieved Guests: %d (decreased %.2f %%)" % (len(new_retrieved_guests), utils.get_decreased_percent(new_retrieved_guests, guests)))

    return (new_hosts, new_listings, new_reviews, new_overall_guests, new_retrieved_guests)