def structure_records_total_counts():
    global records_total_counts, max_num
    data_thesis = database.database("Thesis")
    #get location clusters (trying "go big" for now)
    location_clusters = data_thesis.get_data("SELECT `features_plain_gobig`.`listing_id`, `location_cluster`, `cluster` FROM `features_plain_gobig` INNER JOIN `listing_clusters_big_all` ON `features_plain_gobig`.`listing_id` = `listing_clusters_big_all`.`listing_id` WHERE location_cluster != -1")
    #pulled data columns: `listing_id`, `location_cluster`, `cluster`
    #final data structure:
    #location_cluster: {listing_cluster: {year: {day: percentage occupied}}}
    for year in ["2013", "2014", "2015", "2016"]:
        records_averages_full_year["ENQUIRY"][year] = {}
        records_averages_full_year["CANCELLED"][year] = {}
    for entry in location_clusters:
        #if the location cluster is new, add it at every level
        if entry[1] not in records_total_counts["ENQUIRY"]:
            for filename in ["ENQUIRY", "CANCELLED"]:
                records_total_counts[filename][entry[1]] = {}
                records_averages[filename][entry[1]] = {}
                for year in ["2013", "2014", "2015", "2016"]:
                    records_averages_full_year[filename][year][entry[1]] = {}
        #if the listing cluster is new, give it a default date structure
        if entry[2] not in records_total_counts["ENQUIRY"][entry[1]].keys():
            for filename in ["ENQUIRY", "CANCELLED"]:
                records_total_counts[filename][entry[1]][entry[2]] = time.default_date_structure()
                records_averages[filename][entry[1]][entry[2]] = time.default_date_structure()
                for year in ["2013", "2014", "2015", "2016"]:
                    records_averages_full_year[filename][year][entry[1]][entry[2]] = {}
    data_thesis.destroy_connection()
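# time.default_date_structure() is used throughout these functions but is
# not defined in this excerpt. A minimal sketch of what it plausibly builds,
# assuming a {year: {ISO-date-string: None}} calendar spanning 2013-01-01
# to 2016-01-29 (the year keys, date-string keys, and None placeholders all
# match how the surrounding code reads and writes the structure):
import datetime

def default_date_structure():
    structure = {}
    day = datetime.date(2013, 1, 1)
    while day < datetime.date(2016, 1, 30):
        structure.setdefault(str(day.year), {})[str(day)] = None
        day += datetime.timedelta(days=1)
    return structure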
def insert_zeros():
    global listing_dict
    #get earliest_dates (might as well):
    data_thesis = database.database("Thesis")
    earliest_dates = data_thesis.get_data("SELECT * FROM `earliest_date`")
    for entry in earliest_dates:  #for every listing
        #entry[0] = listing_id, entry[1] = earliest recorded date
        #all_dict layout: status -> listing_id -> year -> day -> occupancy
        for status, all_data in all_dict.iteritems():
            try:
                for year, days in all_data[entry[0]].iteritems():
                    for day, occupancy in days.iteritems():  #within the year
                        try:
                            #if the day is after the earliest recorded date of activity
                            if int((datetime.datetime.strptime(day, "%Y-%m-%d").date() - entry[1]).days) > 0:
                                if occupancy is None:
                                    listing_dict[entry[0]][year][day] = 0
                        except Exception:
                            pass
            except Exception:
                #most likely the listing has no data for this year
                pass
    data_thesis.destroy_connection()
def fill_cluster_data():
    _fill_cluster_listings()
    global clusters_listings, clusters_listings_with_data
    print "Finished getting the data structures ready. Time to find averages"
    '''
    Now we begin to find the averages per day
    '''
    data_thesis = database.database("Thesis")
    #find the average for each day
    for location_cluster, listing_cluster in clusters_listings.iteritems():
        for listing_cluster_id, date_data in listing_cluster.iteritems():
            #the member listings of a cluster pair don't change per day,
            #so fetch them once per pair rather than inside the day loop
            query = "SELECT `listing_clusters_plain`.`listing_id` FROM `listing_clusters_plain` INNER JOIN `listing_locations_DBSCAN_final` ON `listing_clusters_plain`.`listing_id` = `listing_locations_DBSCAN_final`.`listing_id` WHERE `label_id` = %s AND `cluster_id` = %s GROUP BY `listing_id`"
            occupancy_set = data_thesis.get_data(query % (location_cluster, listing_cluster_id))
            for year, days_data in date_data.iteritems():
                for this_day in days_data.keys():  #day is a "YYYY-MM-DD" string
                    #get the average and store it
                    average = _get_average(occupancy_set, this_day)
                    clusters_listings[location_cluster][listing_cluster_id][year][this_day] = average
                    if average:
                        clusters_listings_with_data.append({location_cluster: listing_cluster})
    print "finished, here's a sample"
    #print clusters_listings[1][len(clusters_listings[1].keys()) - 2]
    data_thesis.destroy_connection()
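# _get_average() is called above but not defined in this excerpt. A minimal
# sketch, assuming it averages the per-listing occupancy values stored in
# the global listing_dict (keyed by str(listing_id), as elsewhere in this
# module) and returns None when no listing in the cluster has data:
def _get_average(occupancy_set, this_day):
    year = this_day[:4]  # day strings are "YYYY-MM-DD"
    values = []
    for row in occupancy_set:
        try:
            occupancy = listing_dict[str(row[0])][year][this_day]
        except KeyError:
            continue
        if occupancy is not None:
            values.append(occupancy)
    if not values:
        return None
    return sum(values) / float(len(values))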
def organize_data():
    thesis_data = database.database("Thesis")
    print "\nNow let's shape up our features"
    my_data = thesis_data.get_data("SELECT `all_data_better`.`listing_id`, property_title, property_type, size, description, room_description, bed_count, bathroom_boolean, bed_type, accomodates, label_id FROM `all_data_better` INNER JOIN `dbscan_labels_full` on `dbscan_labels_full`.`listing_id` = `all_data_better`.`listing_id` INNER JOIN `listings_accomodation` ON `listings_accomodation`.`listing_id` = `all_data_better`.`listing_id` WHERE label_id IN(0, 1, 19)")
    print "I have data!"
    print my_data[1]
    thesis_data.destroy_connection()
    #my_data columns: listing_id, property_title, property_type, size,
    #description, room_description, bed_count, bathroom_boolean, bed_type,
    #accomodates, label_id (location)
    final = {entry[0]: {"property_title": entry[1],
                        "property_type": entry[2],
                        "description": entry[4],
                        "size": entry[3],
                        "rooms": [],
                        "beds": {},
                        "bathroom_count": 0,
                        "accomodates": entry[9],
                        "location": entry[10]}  #label_id
             for entry in my_data}
    for entry in my_data:
        #rooms
        if entry[5] is not None and entry[5] > ' ':
            final[entry[0]]["rooms"].append(entry[5])
        else:
            final[entry[0]]["rooms"].append("empty_content")
        #beds
        if entry[8] is not None and entry[8] > " ":
            if entry[8] not in final[entry[0]]["beds"].keys():
                final[entry[0]]["beds"][entry[8]] = 1
            else:
                final[entry[0]]["beds"][entry[8]] += 1
        else:
            if "empty_content" not in final[entry[0]]["beds"].keys():
                final[entry[0]]["beds"]["empty_content"] = 1
            else:
                final[entry[0]]["beds"]["empty_content"] += 1
        #bathrooms
        if entry[7] is not None:
            final[entry[0]]["bathroom_count"] += entry[7]
    return final
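# For reference, one entry of the dict organize_data() returns has this
# shape (values here are illustrative, not real data):
# final[12345] = {"property_title": "Cosy flat", "property_type": 2,
#                 "description": "...", "size": 60,
#                 "rooms": ["bedroom with balcony", "empty_content"],
#                 "beds": {"double": 1, "empty_content": 1},
#                 "bathroom_count": 2, "accomodates": 4, "location": 1}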
def get_testing_listings(list_of_location_ids=[1]):
    #return listings that have at least one year of occupancy data,
    #restricted to the given location clusters (Barcelona, label 1, by default)
    global my_features
    testing_listings = []
    thesis_data = database.database("Thesis")
    query_entries = ",".join(str(x) for x in list_of_location_ids)
    query = "SELECT `listing_locations_DBSCAN_final`.`listing_id`, `listing_clusters_plain`.`cluster_id`, `listing_locations_DBSCAN_final`.`label_id` FROM `listing_locations_DBSCAN_final` INNER JOIN `listing_clusters_plain` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE `label_id` IN(" + query_entries + ");"
    initial_data = thesis_data.get_data(query)
    for listing_data in initial_data:
        try:
            #keep only listings that appear in the occupancy data
            my_features.json_files["occupancy_dict"][str(listing_data[0])]
            testing_listings.append(listing_data)
        except KeyError:
            pass
    thesis_data.destroy_connection()
    return testing_listings
def get_valid_ids():
    global valid_ids
    thesis_data = database.database("Thesis")
    pot_listings = thesis_data.get_data("SELECT `listing_id` FROM listing_clusters_plain")
    valid_ids = [entry[0] for entry in pot_listings]
    thesis_data.destroy_connection()
def _get_city(location_cluster):
    thesis_database = database.database("Thesis")
    data_list = thesis_database.get_data("SELECT `city` FROM `listing_locations_DBSCAN_final` WHERE `listing_locations_DBSCAN_final`.`label_id` = %s GROUP BY `label_id`" % (int(location_cluster)))
    thesis_database.destroy_connection()
    return data_list[0][0]
def get_data():
    #the CASE collapses every booked-like status to a single 'CONFIRMED' label
    query = "SELECT `listing_id`,`created_at`, `checkin`, `checkout`, `status`, `id`, CASE WHEN `status` IN('CONFIRMED', 'BLOCKEDBYCONFIRMED', 'CANCELLATIONREQUESTED', 'UNAVAILABLE', 'DOUBLEBOOKING') THEN 'CONFIRMED' END AS status FROM `reservations` WHERE `additional_description` NOT LIKE 'test ' AND `checkin` >= '2014-01-01' AND `checkout` < '2016-01-30' AND `listing_id` IS NOT NULL AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkout`, `checkin`) >= 0 ORDER BY `status`;"
    worldhomes_data = database.database("worldhomes")
    full_data = worldhomes_data.get_data(query)
    worldhomes_data.destroy_connection()
    return full_data
def insert_zeros():
    global listing_dict
    #get earliest (created_at) and end (updated/deleted) dates per listing
    data_worldhomes = database.database("worldhomes")
    earliest_dates = data_worldhomes.get_data("SELECT `id`, `created_at` FROM `listings`")
    earliest_dates_dict = {entry[0]: entry[1] for entry in earliest_dates}
    end_dates_list = data_worldhomes.get_data("SELECT `id`, `updated_at`, `deleted_at`, `active` FROM `listings`;")
    #sort into a dictionary
    end_dates = {entry[0]: {'updated_at': entry[1],
                            'deleted_at': entry[2],
                            'active': entry[3]}
                 for entry in end_dates_list}
    for listing in listing_dict.keys():  #for every listing
        for year, days in listing_dict[listing].iteritems():
            for day, occupancy in days.iteritems():  #within the year
                #see if there is an end-date entry
                end_date = _get_end_date(int(listing), end_dates)
                if end_date:
                    #positive while the end date is still after the current day
                    days_before_end = int((end_date.date() - datetime.datetime.strptime(day, "%Y-%m-%d").date()).days)
                else:
                    days_before_end = None
                days_after_earliest = int((datetime.datetime.strptime(day, "%Y-%m-%d").date() - earliest_dates_dict[int(listing)].date()).days)
                if days_after_earliest < 0:
                    #even with recorded occupancy, days before the official
                    #created date are forced back to None
                    listing_dict[listing][year][day] = None
                elif days_before_end is not None:
                    if days_before_end > 0:
                        if occupancy is None:
                            listing_dict[listing][year][day] = 0
                    else:
                        #day is on or after the end date
                        listing_dict[listing][year][day] = None
                else:
                    #no end date on record: zero-fill unset days after creation
                    if occupancy is None:
                        listing_dict[listing][year][day] = 0
    print "finished inserting zeros, here's what one listing looks like"
    print listing_dict[listing_dict.keys()[10]]
    data_worldhomes.destroy_connection()
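# _get_end_date() is called above but not included in this excerpt. A
# minimal sketch, assuming it returns deleted_at for deleted listings,
# updated_at for listings flagged inactive, and None for listings that are
# still active (consistent with how the result is compared to each day):
def _get_end_date(listing_id, end_dates):
    try:
        dates = end_dates[listing_id]
    except KeyError:
        return None
    if dates['deleted_at'] is not None:
        return dates['deleted_at']
    if not dates['active']:
        return dates['updated_at']
    return None  # still active, so no end date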
def get_data():
    query = "SELECT `listing_id`,`checkin`, `checkout`, `status`, `created_at`, `id` FROM `reservations` WHERE (`additional_description` IS NULL OR (`additional_description` NOT LIKE 'test ' AND `additional_description` NOT LIKE ' test')) AND `checkin` >= '2013-01-01' AND `checkout` < '2016-01-30' AND `status` IN('ENQUIRY', 'CANCELLED') AND `listing_id` IS NOT NULL AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkin`, `created_at`) >= 0 AND DATEDIFF(`checkout`, `checkin`) > 0 ORDER BY `status`;"
    database_worldhomes = database.database("worldhomes")
    database_worldhomes.cursor.execute(query)
    all_data = list(database_worldhomes.cursor.fetchall())
    database_worldhomes.destroy_connection()
    return all_data
def fill_weights():
    global listing_weights, bed_weights
    from library import database
    thesis_data = database.database("Thesis")
    data_list = thesis_data.get_data("SELECT * FROM `types_of_listings`;")
    listing_weights = {entry[0]: {'small': entry[4], 'big': entry[3]} for entry in data_list}
    data_list = thesis_data.get_data("SELECT * FROM `bed_types`;")
    bed_weights = {entry[0]: {'small': entry[2], 'big': entry[1]} for entry in data_list}
    thesis_data.destroy_connection()
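# For reference, the resulting shape (values illustrative only): each
# property-type or bed-type id maps to its weight under the two feature
# scalings, e.g. listing_weights[3] == {'small': 0.25, 'big': 2.0}.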
def fill_global_data(inclusive=True):
    global listing_important_dates, json_files, feature_data_space, listing_cluster_normalisation
    #json_files loading: occupancy_dict, date_dict
    for filename in json_files.keys():
        json_files[filename] = _load_json(filename, inclusive)
    #earliest dates
    worldhomes_data = database.database("worldhomes")
    listing_important_dates_list = worldhomes_data.get_data("SELECT `id`, `created_at`, `updated_at`, `deleted_at`, `active` FROM `listings`")
    listing_important_dates = {entry[0]: {"created_at": entry[1],
                                          "updated_at": entry[2],
                                          "deleted_at": entry[3],
                                          "active": entry[4]}
                               for entry in listing_important_dates_list}
    worldhomes_data.destroy_connection()
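# _load_json() is called above but not shown. A minimal sketch, assuming
# each json_files key names a JSON file under data/ and that `inclusive`
# switches between two file variants (the suffix naming here is
# hypothetical, not taken from the source):
import json

def _load_json(filename, inclusive):
    suffix = "" if inclusive else "_exclusive"  # hypothetical naming scheme
    with open("data/%s%s.json" % (filename, suffix)) as json_file:
        return json.load(json_file)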
def main():
    query = "SELECT `listing_id`,`checkin`, `checkout`, `created_at`, `additional_description` FROM `reservations` WHERE (`additional_description` IS NULL OR (`additional_description` NOT LIKE 'test ' AND `additional_description` NOT LIKE ' test')) AND `checkin` >= '2013-01-01' AND `checkout` < '2016-01-29' AND `listing_id` IS NOT NULL AND `status` IN('CONFIRMED','BLOCKEDBYCONFIRMED','UNAVAILABLE', 'DOUBLEBOOKING', 'CANCELLATIONREQUESTED') AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkin`, `created_at`) > 0;"
    worldhomes_data = database.database("worldhomes")
    my_data = worldhomes_data.get_data(query)
    thesis_data = database.database("Thesis")
    valid_listings = thesis_data.get_data("SELECT `listing_id` FROM listing_locations_DBSCAN_final WHERE label_id IN(0, 1, 19);")
    valid_listings = [entry[0] for entry in valid_listings]
    thesis_data.destroy_connection()
    for entry in my_data:
        process_entry(entry, valid_listings)
    print "finished processing entries in my data, now need to insert 0's"
    insert_zeros()
    print "now let's find the average cluster data for each day"
    fill_cluster_data()
    global clusters_listings
    cluster_filepath = "data/cluster_averages.json"
    occupancy_filepath = "data/occupancy_dict.json"
    with open(cluster_filepath, 'w') as outfile:
        json.dump(clusters_listings, outfile)
    with open(occupancy_filepath, 'w') as outfile2:
        json.dump(listing_dict, outfile2)
    print "Finished saving the average cluster listings data. Onto occupancy_dict"
    worldhomes_data.destroy_connection()
def get_data(custom_prices=True):
    global earliest_dates
    worldhomes_data = database.database("worldhomes")
    earliest_dates_list = worldhomes_data.get_data("SELECT `id`, `created_at` FROM `listings`")
    earliest_dates = {entry[0]: entry[1] for entry in earliest_dates_list}
    if custom_prices is True:
        query = 'SELECT `listing_id`,`date_start`, `date_end`, `rate` FROM `listing_custom_prices` WHERE date_end >= "2013-01-01" AND date_start < "2016-01-30" AND date_start > "2012-01-01" ORDER BY `date_start` ASC;'
    else:
        query = 'SELECT `id`,`rate` FROM `listings` WHERE master_id IS NULL;'
    final = worldhomes_data.get_data(query)
    worldhomes_data.destroy_connection()
    return final
def main():
    global all_prices_calendar, valid_listings
    all_data = get_data()
    #make the structure, only for listing_ids that have data
    thesis_data = database.database("Thesis")
    pot_listings = thesis_data.get_data("SELECT `listing_id` FROM listing_clusters_plain")
    valid_listings = [entry[0] for entry in pot_listings]
    thesis_data.destroy_connection()
    for listing_id in valid_listings:
        all_prices_calendar[listing_id] = time.default_date_structure()
    #fill the structure with custom prices
    for item in all_data:
        process_entry(item)
    #fall back to the default rate (just id, rate) for listings with no custom prices
    all_data = get_data(False)
    for entry in all_data:
        listing_id = entry[0]
        if listing_id not in all_prices_calendar.keys():
            #process_entry expects: listing_id, date_start, date_end, rate
            all_prices_calendar[listing_id] = time.default_date_structure()
            to_process = [listing_id, datetime.date(2014, 1, 1), datetime.date(2016, 1, 29), entry[1]]
            is_none = process_entry(to_process)
            if is_none is True:
                del all_prices_calendar[listing_id]
    #fill the ends of the date ranges and make sure no day is left 0 or None
    for listing_id, default_date_dict in all_prices_calendar.iteritems():
        fill_nones(listing_id, datetime.date(2016, 1, 29))
    #save it
    with open("data/price_dict.json", 'w') as jsonFile:
        json.dump(all_prices_calendar, jsonFile)
    print "Finished!"
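# fill_nones() is called above but not defined in this excerpt. A minimal
# sketch, assuming it walks a listing's calendar in date order and carries
# the last seen rate forward so that no day up to end_date is left 0 or
# None (days before the first known rate are left untouched in this sketch):
def fill_nones(listing_id, end_date):
    calendar = all_prices_calendar[listing_id]
    last_rate = None
    for year in sorted(calendar.keys()):
        for day in sorted(calendar[year].keys()):
            if datetime.datetime.strptime(day, "%Y-%m-%d").date() > end_date:
                continue
            if calendar[year][day]:
                last_rate = calendar[year][day]
            elif last_rate is not None:
                calendar[year][day] = last_rate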
def save_to_database(city, feature_header, feature_varience, experiment_name):
    thesis_data = database.database("Thesis")
    location_dict = {1: "Barcelona", 0: "Rome", 19: "Amsterdam"}
    #clear any previous results for this city/experiment pair
    query = "DELETE FROM `feature_varience` WHERE `city` = '" + location_dict[city] + "' AND `type_standardsiation` = '" + experiment_name + "';"
    thesis_data.execute(query)
    #table feature_varience columns: city, feature_name, variance, type_standardsiation
    query = "INSERT INTO `feature_varience` VALUES ('%s', '%s', %s, '%s');"
    for x, entry in enumerate(feature_header):
        to_insert = (location_dict[city], entry, feature_varience[x], experiment_name)
        thesis_data.execute(query % to_insert)
    thesis_data.destroy_connection()
    print "Finished saving to database"
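# The queries above interpolate values directly into the SQL string. Since
# library.database exposes its cursor elsewhere in this codebase (see the
# ENQUIRY/CANCELLED get_data()), the same insert could use bound parameters
# instead, which sidesteps quoting and escaping problems entirely:
# thesis_data.cursor.execute(
#     "INSERT INTO `feature_varience` VALUES (%s, %s, %s, %s)",
#     (location_dict[city], entry, feature_varience[x], experiment_name))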
def main():
    #save listing features into listing_plain_features
    #columns: listing_id INT, property_title TEXT, description TEXT,
    #property_type INT, size INT, room_count INT, bathroom_count INT, beds TEXT
    #get all data
    thesis_data = database.database("Thesis")
    query = "SELECT `listing_id`, `property_type`, `room_count`, `bathroom_count`, `beds`, `accomodates` FROM `listing_plain_features` WHERE `accomodates` IS NOT NULL;"
    data = thesis_data.get_data(query)
    thesis_data.destroy_connection()
    data_dict = {entry[0]: {"property_type": entry[1],
                            "room_count": entry[2],
                            "bathroom_count": entry[3],
                            "beds": entry[4],
                            'accommodates': entry[5]}
                 for entry in data}
    #set weights
    fill_weights()
    features_plain = calculate_features(data_dict, "plain")
    save_calculated_features(features_plain, "features_plain_gobig")
    features_fancy = calculate_features(data_dict, "fancy")  #dimensions all within 0-1
    save_calculated_features(features_fancy, "features_fancy")
def save_calculated_features(feature_list, database_name):
    #TODO later: features_plain_gobig_new, features_plain_gosmall_new
    thesis_data = database.database("Thesis")
    thesis_data.clear_table(database_name)
    for entry in feature_list:
        insert_string = ", ".join(str(item) for item in entry)
        query = "INSERT INTO `" + database_name + "` VALUES(" + insert_string + ");"
        thesis_data.execute(query)
    thesis_data.destroy_connection()
def main():
    global full_reservation_data
    all_data = get_data()
    #get valid_listings and make the reservation structure
    thesis_data = database.database("Thesis")
    valid_listings = thesis_data.get_data("SELECT `listing_id` FROM listing_clusters_plain")
    full_reservation_data = {entry[0]: time.default_date_structure() for entry in valid_listings}
    for entry in all_data:
        if entry[0] in full_reservation_data.keys():
            process_data(entry)
    with open("data/monte_carlo_reservation_dict.json", 'w') as outFile:
        json.dump(full_reservation_data, outFile)
    thesis_data.destroy_connection()
def _fill_cluster_listings():
    global clusters_listings
    data_thesis = database.database("Thesis")
    #get location clusters
    location_clusters = data_thesis.get_data("SELECT `listing_clusters_plain`.`listing_id`,`listing_locations_DBSCAN_final`.`label_id`, `listing_clusters_plain`.`cluster_id` FROM `listing_clusters_plain` INNER JOIN `listing_locations_DBSCAN_final` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE label_id != -1;")
    #pulled data columns: `listing_id`, `location_cluster`, `cluster`
    #final data structure:
    #location_cluster: {listing_cluster: {year: {day: percentage occupied}}}
    for entry in location_clusters:
        #if the location cluster is new
        if entry[1] not in clusters_listings.keys():
            clusters_listings[entry[1]] = {}
        #if the listing cluster is new
        if entry[2] not in clusters_listings[entry[1]].keys():
            clusters_listings[entry[1]][entry[2]] = default_date_structure()
    data_thesis.destroy_connection()
def main():
    from library import database  #unclear why this import now has to live inside the function
    data_thesis = database.database("Thesis")
    query_plain = "SELECT * FROM `features_plain_gobig`;"
    query_fancy = "SELECT * FROM `features_fancy`;"
    data_plain = data_thesis.get_data(query_plain)
    data_fancy = data_thesis.get_data(query_fancy)
    #split each row into its feature vector and its listing_id
    go_plain = [[float(item) for item in entry[1:]] for entry in data_plain]
    go_plain_identification = [entry[0] for entry in data_plain]
    go_fancy = [[float(item) for item in entry[1:]] for entry in data_fancy]
    go_fancy_identification = [entry[0] for entry in data_fancy]
    print "Let's try to isolate clusters by location!"
    #_DBSCAN(cluster_data, identification, db)
    plain_final = _Dirichlet(go_plain, go_plain_identification)
    fancy_final = _Dirichlet(go_fancy, go_fancy_identification)
    '''
    go big: 11 clusters (full data)
    go small: 13 clusters (full data)
    data: listing_id, cluster
    '''
    #save data
    data_thesis.clear_table("listing_clusters_fancy")
    data_thesis.clear_table("listing_clusters_plain")
    for x in range(0, len(plain_final)):
        data_thesis.add_entry_to_table("listing_clusters_plain", plain_final[x])
        data_thesis.add_entry_to_table("listing_clusters_fancy", fancy_final[x])
    data_thesis.destroy_connection()
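# _Dirichlet() is not included in this excerpt. A minimal sketch of a
# Dirichlet-process-style clustering step using scikit-learn's
# BayesianGaussianMixture (whether the original used this class is an
# assumption; the [[listing_id, cluster], ...] return shape matches how
# the result is saved above):
from sklearn.mixture import BayesianGaussianMixture

def _Dirichlet(cluster_data, identification):
    model = BayesianGaussianMixture(
        n_components=20,  # upper bound; surplus components get ~zero weight
        weight_concentration_prior_type='dirichlet_process')
    model.fit(cluster_data)
    labels = model.predict(cluster_data)
    return [[identification[i], int(labels[i])] for i in range(len(labels))]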
def save_to_database(table_name, experiment_name, city_name, full_dict):
    thesis_data = database.database("Thesis")
    #delete any previous results for this city/experiment pair
    query = "DELETE FROM `" + table_name + "` WHERE `city` = '" + city_name + "' AND `experiment` = '" + experiment_name + "';"
    thesis_data.execute(query)
    print "saving to database " + table_name + " experiment results: " + experiment_name
    #per-listing results: the keys of full_dict are listing ids (longs)
    if full_dict and isinstance(full_dict.keys()[0], long):
        insert_query = "INSERT INTO " + table_name + " VALUES('%s','%s',%s,'%s',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        #columns: experiment, city, listing_id, method, true_true, true_false,
        #false_true, false_false, occupancy_precision, occupancy_recall,
        #empty_precision, empty_recall, occupancy_fOne, empty_fOne, correct_overall
        for listing_id, full_records in full_dict.iteritems():
            for method, method_results in full_records.iteritems():
                to_insert = [experiment_name, city_name, listing_id, method]
                for this_thing in ["true_true", "true_false", "false_true",
                                   "false_false", "occupancy_precision",
                                   "occupancy_recall", "empty_precision",
                                   "empty_recall", "occupancy_fOne",
                                   "empty_fOne", "correct_overall"]:
                    if method_results[this_thing]:
                        to_insert.append(method_results[this_thing])
                    else:
                        to_insert.append("null")
                thesis_data.execute(insert_query % tuple(to_insert))
    #aggregate results: the keys of full_dict are method names
    elif full_dict:
        insert_query = "INSERT INTO " + table_name + " VALUES('%s','%s','%s',%s,%s,%s,%s,%s,%s)"
        #columns: experiment, city, method, occupancy_precision,
        #occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne
        for method, method_results in full_dict.iteritems():
            to_insert = [experiment_name, city_name, method]
            for this_thing in ["occupancy_precision", "occupancy_recall",
                               "empty_precision", "empty_recall",
                               "occupancy_fOne", "empty_fOne"]:
                if method_results[this_thing]:
                    to_insert.append(method_results[this_thing])
                else:
                    to_insert.append("null")
            thesis_data.execute(insert_query % tuple(to_insert))
    thesis_data.destroy_connection()
def save_general_features(full_listing_data):
    thesis_data = database.database("Thesis")
    #columns: listing_id INT, property_title TEXT, description TEXT,
    #property_type INT, size INT, room_count INT, bathroom_count INT,
    #beds TEXT, accomodates INT
    thesis_data.clear_table("listing_plain_features")
    query = "INSERT INTO `listing_plain_features` VALUES(%s, \"%s\", '%s', %s, %s, %s, %s, \"%s\", %s)"
    for listing_id, listing_data in full_listing_data.iteritems():
        beds = str(listing_data["beds"])
        if listing_data["property_title"]:
            title = listing_data["property_title"]
        else:
            title = "null"
        if listing_data["description"]:
            description = listing_data["description"]
        else:
            description = "null"
        if listing_data["property_type"]:
            property_type = listing_data["property_type"]
        else:
            property_type = "null"
        if listing_data["size"]:
            size = listing_data["size"]
        else:
            size = "null"
        if listing_data["rooms"]:
            room_count = len(listing_data["rooms"])
        else:
            room_count = "null"
        if listing_data["accomodates"]:
            accomodates = listing_data["accomodates"]
        else:
            accomodates = "null"
        thesis_data.execute(query % (listing_id, title, description, property_type, size, room_count, listing_data["bathroom_count"], beds, accomodates))
        print "saved entry for", listing_id
    thesis_data.destroy_connection()
with open("data/monte_carlo_reservation_dict.json") as jsonFile: all_reservations_dict = json.load(jsonFile) with open('data/k-means_season_clusters.json') as jsonFile: k_means = json.load(jsonFile) with open ('data/occupancy_dict.json') as jsonFile: occupancy_dict = json.load(jsonFile) listing_location = common_database_functions.location_listing_pairings(True) #final: listing_id: day: {reservations: {}, k_cluster: , occupancy: } final = {} #find valid listings with the right created-at dates worldhomes_data = database.database("worldhomes") enough_data_listings = worldhomes_data.get_data("SELECT `id` FROM `listings` WHERE `created_at` > '2013-12-31';") enough_data_listings = [entry[0] for entry in enough_data_listings] valid_ids = [this_id for this_id in occupancy_dict.keys() if int(this_id) in enough_data_listings] for listing_id in valid_ids: final[listing_id] = {} for day in my_time._daterange(datetime.date(2014, 1, 1) , datetime.date(2016, 1,30) ): #does not include the outer bound year = str(day.year) final[listing_id][str(day)] = {} try: if all_reservations_dict[listing_id][year][str(day)]: final[listing_id][str(day)]['reservations'] = all_reservations_dict[listing_id][year][str(day)] else:
def fill_records_total_counts_data():
    structure_records_total_counts()
    global records, records_total_counts, max_num
    print "Finished getting the data structures ready"
    '''
    Now we begin to find the averages per day
    '''
    data_thesis = database.database("Thesis")
    #find the average for each day
    for filename in ["ENQUIRY", "CANCELLED"]:
        print "on:", filename
        for location_cluster, listing_cluster in records_total_counts[filename].iteritems():
            print "On location,", location_cluster
            for listing_cluster_id, date_data in records_total_counts[filename][location_cluster].iteritems():
                #the member listings of a cluster pair don't change per day,
                #so fetch them once per pair
                query = "SELECT `features_plain_gobig`.`listing_id` FROM `features_plain_gobig` INNER JOIN `listing_clusters_big_all` ON `features_plain_gobig`.`listing_id` = `listing_clusters_big_all`.`listing_id` WHERE `location_cluster` = %s AND `cluster` = %s GROUP BY `listing_id`"
                listing_id_set = data_thesis.get_data(query % (location_cluster, listing_cluster_id))
                #flatten rows if the driver returned tuples/lists
                if isinstance(listing_id_set[0], (tuple, list)):
                    listing_id_set = [entry[0] for entry in listing_id_set]
                for year, days_data in date_data.iteritems():
                    for this_day, day_list in days_data.iteritems():  #day is a string
                        days_data[this_day] = []
                        for listing_id in listing_id_set:
                            try:
                                days_data[this_day].append(len(records[filename][str(listing_id)][year][this_day].keys()))
                            except Exception:
                                #nothing on this day in the database;
                                #check whether the listing was active
                                try:
                                    date_activity = records["occupancy_dict"][str(listing_id)][year][this_day]
                                    if date_activity == 0:
                                        days_data[this_day].append(0)
                                except KeyError:
                                    #no occupancy data for this listing/day
                                    pass
                    #do the average calculations here, per year
                    print "doing averages for year,", year
                    to_add = fill_records_averages(days_data)
                    records_averages[filename][location_cluster][listing_cluster_id][year] = to_add
                    if to_add is False:
                        del records_averages_full_year[filename][year][location_cluster][listing_cluster_id]
                    else:
                        print "Huzzah, some records for year,", year, "location,", location_cluster
                        records_averages_full_year[filename][year][location_cluster][listing_cluster_id][year] = to_add
    data_thesis.destroy_connection()
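# fill_records_averages() is called above but not defined in this excerpt.
# A minimal sketch, assuming it reduces each day's list of per-listing
# counts to a mean and returns False when the whole year has no data
# (matching the `if to_add is False` check above):
def fill_records_averages(days_data):
    averages = {}
    any_data = False
    for this_day, counts in days_data.iteritems():
        if counts:
            averages[this_day] = sum(counts) / float(len(counts))
            any_data = True
        else:
            averages[this_day] = None
    return averages if any_data else False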