def get_relative_portion_of_missing_values():
    """Return (column, NaN count, total count) for every float column of the training set."""
    train_data = GlobalData("train")
    item_helper = ItemHelper()
    result = []
    for key in item_helper.get_all_column_names():
        train_data.load(key)
        np_array = train_data.get(key)
        if np_array.dtype.char == "d":  # only float64 columns can contain NaN
            result.append((key, np.count_nonzero(np.isnan(np_array)), np_array.size))
        train_data.discard(key)  # free the column before loading the next one
    return result
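
# A minimal usage sketch, not part of the original script: it assumes numpy is imported
# as np and GlobalData/ItemHelper are available as above, and turns the raw counts
# returned by get_relative_portion_of_missing_values() into a relative portion per column.
def print_missing_value_report():
    for key, nan_count, total in get_relative_portion_of_missing_values():
        ratio = float(nan_count) / total if total else 0.0
        print "{0}: {1}/{2} missing ({3:.2%})".format(key, nan_count, total, ratio)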
def combine_npys(data_name):
    """Stack the per-column .npy files of a data set into one combined 2-D array."""
    data = GlobalData(data_name)
    item_helper = ItemHelper()
    keys = item_helper.get_all_column_names_new()
    result = np.load(data.get_path(keys[0]))
    for i in range(1, len(keys)):
        np_array = np.load(data.get_path(keys[i]))
        logger.info("Combining {0} with {1} values in total.".format(keys[i], np_array.size))
        result = np.vstack((result, np_array))  # one row per column of the original data
    np.save("data_numpy/combined_" + data_name + ".npy", result)
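
# Usage sketch with assumed names: it presumes the per-column .npy files of a data set
# called "sampled_normalized_valid_train" already exist under the paths GlobalData knows
# about; combine_npys() then writes data_numpy/combined_sampled_normalized_valid_train.npy,
# the file the scripts below load.
if __name__ == "__main__":
    combine_npys("sampled_normalized_valid_train")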
def sampling_data(data_name, sampling_rate, method_type):
    """Subsample every column of a data set.

    method_type 1 draws rows at random (with replacement); method_type 2 simply
    takes the first sampling_rate fraction of the rows.
    """
    data = GlobalData(data_name)  # e.g. "normalized_train_attri"
    sampled_data = GlobalData("sampled_" + data_name)
    item_helper = ItemHelper()
    num_rows = 9917530  # number of rows in the full training set
    if method_type == 1:
        sampled = np.random.choice(num_rows, int(sampling_rate * num_rows))
    elif method_type == 2:
        sampled = np.array(range(0, int(sampling_rate * num_rows)))
    for key in item_helper.get_all_column_names_new():
        np_array = np.load(data.get_path(key))
        logger.info("Sampling on the attribute ({0}) with {1} in total.".format(key, np_array.size))
        np.save(sampled_data.get_path(key), np_array[sampled])
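
# Usage sketch with assumed values: the data set name comes from the comment above, while
# the 0.1 sampling rate is illustrative and not taken from the original scripts.
if __name__ == "__main__":
    sampling_data("normalized_train_attri", 0.1, 1)  # 10% random sample
    # sampling_data("normalized_train_attri", 0.1, 2)  # alternative: first 10% of the rows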
def main():
    item_helper = ItemHelper()
    columns = item_helper.get_all_column_names_new()
    original_train_data = np.load("data_numpy/combined_sampled_normalized_valid_train.npy")
    attr_search_id = original_train_data[columns.index("srch_id"), :]
    attr_prop_location_score2 = original_train_data[columns.index("prop_location_score2"), :]
    attr_booking = original_train_data[columns.index("booking_bool"), :]
    # Row 0: booking label, row 1: search id, row 2: feature; transpose to one sample per row.
    train_data = attr_booking
    train_data = np.vstack((train_data, attr_search_id))
    train_data = np.vstack((train_data, attr_prop_location_score2))
    train_data = np.transpose(train_data)
    #model = thirdparty.LambdaMart.mart.learn(train_data, n_trees=10)
    print train_data[train_data[:, 0] == 1, :]  # show only the booked samples
def main():
    item_helper = ItemHelper()
    columns = item_helper.get_all_column_names_new()
    train_data = np.load("data_numpy/combined_sampled_normalized_valid_train.npy")
    feature_vector = train_data[columns.index("prop_location_score2"), :]
    qid_vector = train_data[columns.index("srch_id"), :]  # query id for the ranking format
    score_vector = train_data[columns.index("booking_bool"), :]
    # Rows: relevance label, query id, feature value(s); transpose to one sample per line.
    train_data = score_vector
    train_data = np.vstack((train_data, qid_vector))
    train_data = np.vstack((train_data, feature_vector))
    train_data = np.transpose(train_data)
    # Emit SVMrank/RankLib-style lines: "<label> qid:<srch_id> <feature_index>:<value> ...".
    for x in train_data:
        print "{0} qid:{1}".format(x[0], int(x[1])),
        for idx, val in enumerate(x[2:]):
            print "{0}:{1}".format(idx + 1, val),
        print
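
# Optional sketch, not in the original script: ranking tools that read this format usually
# expect the lines to be grouped by qid, so a stable sort on the query-id column before
# writing may be needed. The variable layout follows the function above; the output path
# is hypothetical.
def write_svmrank_file(train_data, out_path="data_numpy/train_svmrank.txt"):
    order = np.argsort(train_data[:, 1], kind="mergesort")  # stable sort on the qid column
    with open(out_path, "w") as out:
        for x in train_data[order]:
            features = " ".join("{0}:{1}".format(i + 1, v) for i, v in enumerate(x[2:]))
            out.write("{0} qid:{1} {2}\n".format(x[0], int(x[1]), features))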
def convert_data_to_numpy(path, train_data):
    """Read the raw CSV at `path` and convert the selected columns into numpy arrays."""
    line_number = 0
    with open(path) as fp:
        # Read the header only to count the number of fields.
        temp = fp.readline().strip().split(",")
        num_of_fields = len(temp)
        logger.info("Number of fields: {0}".format(num_of_fields))
        # DO NOT TRY TO CONVERT EVERY ATTRIBUTE AT ONCE (IT WILL CONSUME A HUGE AMOUNT OF MEMORY).
        # Uncomment the columns you need:
        need_to_be_convert = [
            # "srch_id", "date_time", "site_id", "visitor_location_country_id",
            # "visitor_hist_starrating", "visitor_hist_adr_usd", "prop_country_id", "prop_id",
            # "prop_starrating", "prop_review_score", "prop_brand_bool", "prop_location_score1",
            # "prop_location_score2", "prop_log_historical_price", "position", "price_usd",
            # "promotion_flag", "srch_destination_id", "srch_length_of_stay", "srch_booking_window",
            # "srch_adults_count", "srch_children_count", "srch_room_count", "srch_saturday_night_bool",
            # "srch_query_affinity_score", "orig_destination_distance", "random_bool",
            # "comp1_rate", "comp1_inv", "comp1_rate_percent_diff",
            # "comp2_rate", "comp2_inv", "comp2_rate_percent_diff",
            # "comp3_rate", "comp3_inv", "comp3_rate_percent_diff",
            # "comp4_rate", "comp4_inv", "comp4_rate_percent_diff",
            # "comp5_rate", "comp5_inv", "comp5_rate_percent_diff",
            # "comp6_rate", "comp6_inv", "comp6_rate_percent_diff",
            # "comp7_rate", "comp7_inv", "comp7_rate_percent_diff",
            # "comp8_rate", "comp8_inv", "comp8_rate_percent_diff",
            "click_bool",
            # "gross_bookings_usd",
            "booking_bool",
        ]
        item_helper = ItemHelper()
        entire_data = []
        mask_entire_data = map(lambda x: item_helper.get_column_index_of(x), need_to_be_convert)
        # for i in range(0, 100000):  # partial conversion
        #     linebuf = fp.readline()
        for linebuf in fp:  # full conversion
            line_number = line_number + 1
            fields = linebuf.strip().split(",")
            if line_number % 1000 == 0:
                print "Reading the line : {0}\r".format(line_number),
            if len(fields) == num_of_fields:
                entire_data.append(map(lambda i: fields[i], mask_entire_data))
            else:
                logger.warning("Mismatching fields: {0}".format(fields))
        print ""
        np_array_entire_data = np.array(entire_data)
        for idx, val in enumerate(need_to_be_convert):
            selected = np_array_entire_data[:, idx]
            train_data.convert(val, selected, True)
            selected = None  # release the reference before the next column
        logger.info("Completed: {0}".format(line_number))
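
# Usage sketch with assumed names: the CSV path is hypothetical, and GlobalData("train")
# is assumed to provide the convert(column, values, ...) method used above.
if __name__ == "__main__":
    train_data = GlobalData("train")
    convert_data_to_numpy("data_csv/train.csv", train_data)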
def extract_features(_data, _selected_features_index, _column_names):
    """Split a sample matrix into feature columns, id columns and target labels."""
    _features = _data[:, _selected_features_index]
    _feature_srch_id = _data[:, _column_names.index("srch_id")]
    _feature_hotel_id = _data[:, _column_names.index("prop_id")]
    _y_click = _data[:, _column_names.index("click_bool")]
    _y_book = _data[:, _column_names.index("booking_bool")]
    return _features, _feature_srch_id, _feature_hotel_id, _y_click, _y_book


# Load Data
# =========
path = os.getcwd()
data = np.load(path + '/validdata/combined_sampled_normalized_valid_train.npy')
data = np.transpose(data)  # one sample per row
item_helper = ItemHelper()
column_names = item_helper.get_all_column_names_new()

import sys
num_of_selection = int(sys.argv[1])  # number of samples per split, taken from the command line
data2 = data[num_of_selection:num_of_selection * 2, :]  # second split (e.g. validation)
data = data[:num_of_selection, :]                       # first split (training)
#data2 = data
# booked_index = data[:, column_names.index("booking_bool")]
# data_booked = data[booked_index == 1, :]
# data_not_booked = data[booked_index == 0, :]
# data_booked_selected1 = data_booked[:num_of_selection, :]
# data_booked_selected2 = data_booked[num_of_selection:num_of_selection * 2, :]
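
# Illustrative continuation, not part of the original script: the selected feature names
# here are an assumption; any subset of column_names would work the same way.
selected_feature_names = ["prop_location_score2", "price_usd", "prop_starrating"]
selected_features_index = [column_names.index(name) for name in selected_feature_names]
features, srch_id, hotel_id, y_click, y_book = extract_features(
    data, selected_features_index, column_names)
features2, srch_id2, hotel_id2, y_click2, y_book2 = extract_features(
    data2, selected_features_index, column_names)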
def get_correlation_matrix():
    """Enumerate every pair of columns (the correlation itself is not computed here)."""
    item_helper = ItemHelper()
    column_names = item_helper.get_all_column_names_new()
    for i in range(0, len(column_names)):
        for j in range(i + 1, len(column_names)):
            print column_names[i], column_names[j]
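
# A minimal sketch of what the stub above appears to be heading towards, under the
# assumption that a combined array produced by combine_npys() holds one column per row;
# the .npy path is hypothetical. np.corrcoef then gives the full pairwise Pearson
# correlation matrix in one call.
def get_correlation_matrix_from_combined(npy_path="data_numpy/combined_train.npy"):
    item_helper = ItemHelper()
    column_names = item_helper.get_all_column_names_new()
    combined = np.load(npy_path)   # shape: (num_columns, num_samples)
    corr = np.corrcoef(combined)   # pairwise Pearson correlations between the rows
    for i in range(0, len(column_names)):
        for j in range(i + 1, len(column_names)):
            print column_names[i], column_names[j], corr[i, j]
    return corr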