Пример #1
0
def get_relative_portion_of_missing_values():
    """Count NaNs per float64 column of the "train" data set.

    Returns:
        list of (column_name, nan_count, total_count) tuples, one per
        column whose numpy dtype char is "d" (float64); other columns
        are skipped.
    """
    train_data = GlobalData("train")
    item_helper = ItemHelper()
    result = []
    for key in item_helper.get_all_column_names():
        train_data.load(key)
        np_array = train_data.get(key)
        if np_array.dtype.char == "d":
            result.append((key, np.count_nonzero(np.isnan(np_array)), np_array.size))
        # Release the column regardless of dtype; previously only float
        # columns were discarded, so every non-float array stayed loaded.
        train_data.discard(key)
    return result
def combine_npys(data_name):
    """Stack every per-column .npy file of *data_name* into one matrix.

    Loads each column listed by ItemHelper (in order), stacks them
    row-wise, and saves the result to data_numpy/combined_<data_name>.npy.
    """
    data = GlobalData(data_name)
    item_helper = ItemHelper()
    keys = item_helper.get_all_column_names_new()
    arrays = []
    for key in keys:
        np_array = np.load(data.get_path(key))
        # Previously the first key was silently skipped in the log; now
        # every combined column is reported.
        logger.info("Combining {0} with {1} in total \n.".format(key, np_array.size))
        arrays.append(np_array)
    # A single vstack over the list avoids the quadratic copying caused
    # by re-stacking the growing accumulator on every iteration.
    result = np.vstack(arrays)
    np.save("data_numpy/combined_" + data_name + ".npy", result)
def sampling_data(data_name, sampling_rate, method_type, num_rows=9917530):
    """Sample every column of *data_name* and save as "sampled_<data_name>".

    Args:
        data_name: source GlobalData name (e.g. "normalized_train_attri").
        sampling_rate: fraction of rows to keep (0..1).
        method_type: 1 = random row indices (with replacement),
            2 = the leading contiguous slice of rows.
        num_rows: total number of rows in the data set; default keeps the
            historical hard-coded size, so existing callers are unchanged.

    Raises:
        ValueError: for an unknown method_type (previously this fell
            through and crashed later with a NameError on `sampled`).
    """
    data = GlobalData(data_name)
    sampled_data = GlobalData("sampled_" + data_name)
    item_helper = ItemHelper()
    sample_size = int(sampling_rate * num_rows)
    if method_type == 1:
        sampled = np.random.choice(num_rows, sample_size)
    elif method_type == 2:
        sampled = np.arange(sample_size)
    else:
        raise ValueError("Unknown method_type: {0}".format(method_type))
    for key in item_helper.get_all_column_names_new():
        np_array = np.load(data.get_path(key))
        logger.info("Sampling on the attribute ({0}) with {1} in total.".format(key, np_array.size))
        np.save(sampled_data.get_path(key), np_array[sampled])
Пример #4
0
def main():
    """Build a (placeholder, srch_id, prop_location_score2) matrix and
    print the rows whose placeholder column equals 1."""
    item_helper = ItemHelper()
    columns = item_helper.get_all_column_names_new()

    original_train_data = np.load("data_numpy/combined_sampled_normalized_valid_train.npy")

    attr_search_id = original_train_data[columns.index("srch_id"), :]
    attr_prop_location_score2 = original_train_data[columns.index("prop_location_score2"), :]
    attr_booking = original_train_data[columns.index("booking_bool"), :]

    # Row 0 is an all-zero placeholder; the two selected attributes follow.
    train_data = np.vstack((np.zeros(attr_booking.shape),
                            attr_search_id,
                            attr_prop_location_score2))
    train_data = np.transpose(train_data)

    #model = thirdparty.LambdaMart.mart.learn(train_data, n_trees = 10)

    print(train_data[train_data[:, 0] == 1, :])
Пример #5
0
def main():
    item_helper = ItemHelper()
    columns = item_helper.get_all_column_names_new()

    train_data = np.load("data_numpy/combined_sampled_normalized_valid_train.npy")
    feature_vector = train_data[columns.index("prop_location_score2"), :]
    score_vector = train_data[columns.index("booking_bool"), :]

    data_without_score = np.zeros(score_vector.shape) # Just for assigning the placeholder.
    data_without_score = np.vstack((data_without_score, feature_vector))

    train_data = score_vector
    train_data = np.vstack((train_data, data_without_score))

    train_data = np.transpose(train_data)
    data_without_score = np.transpose(data_without_score)

    for x in train_data:
		print "{0} qid:{1}".format(x[0],int(x[1])),
		for idx, val in enumerate(x[2:]):
			print "{0}:{1}".format(idx+1, val),
		print
Пример #6
0
def main():
    item_helper = ItemHelper()
    columns = item_helper.get_all_column_names_new()

    train_data = np.load(
        "data_numpy/combined_sampled_normalized_valid_train.npy")
    feature_vector = train_data[columns.index("prop_location_score2"), :]
    score_vector = train_data[columns.index("booking_bool"), :]

    data_without_score = np.zeros(
        score_vector.shape)  # Just for assigning the placeholder.
    data_without_score = np.vstack((data_without_score, feature_vector))

    train_data = score_vector
    train_data = np.vstack((train_data, data_without_score))

    train_data = np.transpose(train_data)
    data_without_score = np.transpose(data_without_score)

    for x in train_data:
        print "{0} qid:{1}".format(x[0], int(x[1])),
        for idx, val in enumerate(x[2:]):
            print "{0}:{1}".format(idx + 1, val),
        print
Пример #7
0
def convert_data_to_numpy(path, train_data):
    """Parse the CSV file at *path* and hand selected columns to *train_data*.

    Reads the header line to learn the expected field count, then scans every
    remaining line, keeping only the columns named in need_to_be_convert.
    Rows whose field count differs from the header are logged and skipped.
    Finally each collected column of strings is passed to
    train_data.convert(name, values, True).

    Args:
        path: path to the comma-separated input file (header on line 1).
        train_data: project data object exposing convert(name, array, flag)
            — presumably persists the converted column; confirm in GlobalData.
    """
    line_number = 0
    with open(path) as fp:
        temp = fp.readline().strip().split(
            ",")  # to ignore the header but to count the number of fields
        num_of_fields = len(temp)
        logger.info("Number of fields: {0}".format(num_of_fields))

        # DO NOT TRY TO UPDATE CONVERT EVERY ATTRIBUTE (IT WILL CONSUME HUGE MEMORY SPACE)
        # The commented names below are the remaining columns of the data set;
        # uncomment an entry to include it in the conversion.
        need_to_be_convert = [
            #"srch_id",
            #"date_time",
            # "site_id",
            # "visitor_location_country_id",
            # "visitor_hist_starrating",
            # "visitor_hist_adr_usd",
            # "prop_country_id",
            # "prop_id",
            # "prop_starrating",
            # "prop_review_score",
            # "prop_brand_bool",
            # "prop_location_score1",
            # # "prop_location_score2",
            # "prop_log_historical_price",
            # "position",
            # "price_usd",
            # "promotion_flag",
            # "srch_destination_id",
            # "srch_length_of_stay",
            # "srch_booking_window",
            # "srch_adults_count",
            # "srch_children_count",
            # "srch_room_count",
            # "srch_saturday_night_bool",
            # "srch_query_affinity_score",
            # "orig_destination_distance",
            # "random_bool",
            # "comp1_rate",
            # "comp1_inv",
            # "comp1_rate_percent_diff",
            # "comp2_rate",
            # "comp2_inv",
            # "comp2_rate_percent_diff",
            # "comp3_rate",
            # "comp3_inv",
            # "comp3_rate_percent_diff",
            # "comp4_rate",
            # "comp4_inv",
            # "comp4_rate_percent_diff",
            # "comp5_rate",
            # "comp5_inv",
            # "comp5_rate_percent_diff",
            # "comp6_rate",
            # "comp6_inv",
            # "comp6_rate_percent_diff",
            # "comp7_rate",
            # "comp7_inv",
            # "comp7_rate_percent_diff",
            # "comp8_rate",
            # "comp8_inv",
            # "comp8_rate_percent_diff",
            "click_bool",
            # "gross_bookings_usd",
            "booking_bool",
        ]

        item_helper = ItemHelper()
        entire_data = []
        # CSV column positions of the selected attributes (Python 2 map
        # returns a plain list here).
        mask_entire_data = map(lambda x: item_helper.get_column_index_of(x),
                               need_to_be_convert)

        #for i in range(0,100000): # partial convertion
        #linebuf = fp.readline()
        for linebuf in fp:  # full convertion
            line_number = line_number + 1
            fields = linebuf.strip().split(",")

            if (line_number % 1000 == 0):
                # "\r" + trailing comma overwrite the progress counter in
                # place instead of scrolling (Python 2 print statement).
                print "Reading the line : {0}\r".format(line_number),

            if (len(fields) == num_of_fields):
                entire_data.append(map(lambda i: fields[i], mask_entire_data))
            else:
                logger.warning("Mismatching fields: {0}".format(fields))

        # Terminate the in-place progress line.
        print ""

        np_array_entire_data = np.array(entire_data)
        for idx, val in enumerate(need_to_be_convert):
            selected = np_array_entire_data[:, idx]
            train_data.convert(val, selected, True)
            # Drop the reference so the slice can be reclaimed early.
            selected = None

        logger.info("Completed: {0}".format(line_number))
Пример #8
0
def convert_data_to_numpy(path, train_data):
    line_number = 0
    with open(path) as fp:
        temp = fp.readline().strip().split(",") # to ignore the header but to count the number of fields
        num_of_fields = len(temp)
        logger.info("Number of fields: {0}".format(num_of_fields))

        # DO NOT TRY TO UPDATE CONVERT EVERY ATTRIBUTE (IT WILL CONSUME HUGE MEMORY SPACE)
        need_to_be_convert = [
            #"srch_id",
            #"date_time",
            # "site_id",
            # "visitor_location_country_id",
            # "visitor_hist_starrating",
            # "visitor_hist_adr_usd",
            # "prop_country_id",
            # "prop_id",
            # "prop_starrating",
            # "prop_review_score",
            # "prop_brand_bool",
            # "prop_location_score1",
            # # "prop_location_score2",
            # "prop_log_historical_price",
            # "position",
            # "price_usd",
            # "promotion_flag",
            # "srch_destination_id",
            # "srch_length_of_stay",
            # "srch_booking_window",
            # "srch_adults_count",
            # "srch_children_count",
            # "srch_room_count",
            # "srch_saturday_night_bool",
            # "srch_query_affinity_score",
            # "orig_destination_distance",
            # "random_bool",
            # "comp1_rate",
            # "comp1_inv",
            # "comp1_rate_percent_diff",
            # "comp2_rate",
            # "comp2_inv",
            # "comp2_rate_percent_diff",
            # "comp3_rate",
            # "comp3_inv",
            # "comp3_rate_percent_diff",
            # "comp4_rate",
            # "comp4_inv",
            # "comp4_rate_percent_diff",
            # "comp5_rate",
            # "comp5_inv",
            # "comp5_rate_percent_diff",
            # "comp6_rate",
            # "comp6_inv",
            # "comp6_rate_percent_diff",
            # "comp7_rate",
            # "comp7_inv",
            # "comp7_rate_percent_diff",
            # "comp8_rate",
            # "comp8_inv",
            # "comp8_rate_percent_diff",
            "click_bool",
            # "gross_bookings_usd",
            "booking_bool",
        ]
        

        item_helper = ItemHelper()
        entire_data = []
        mask_entire_data = map(lambda x: item_helper.get_column_index_of(x), need_to_be_convert)

        #for i in range(0,100000): # partial convertion
            #linebuf = fp.readline()
        for linebuf in fp: # full convertion
            line_number = line_number + 1
            fields = linebuf.strip().split(",")

            if (line_number % 1000 == 0 ):
                print "Reading the line : {0}\r".format(line_number),

            if (len(fields)==num_of_fields):
                entire_data.append(map(lambda i: fields[i], mask_entire_data))
            else:
                logger.warning("Mismatching fields: {0}".format(fields))

        print ""

        np_array_entire_data = np.array(entire_data)
        for idx, val in enumerate(need_to_be_convert):
            selected = np_array_entire_data[:,idx]
            train_data.convert(val, selected, True)
            selected = None

        logger.info("Completed: {0}".format(line_number))
Пример #9
0
def extract_features(_data, _selected_features_index, _column_names):
    """Split a row-major data matrix into feature and label columns.

    Returns a 5-tuple: (selected feature columns, srch_id column,
    prop_id column, click labels, booking labels).
    """
    column_of = _column_names.index
    return (_data[:, _selected_features_index],
            _data[:, column_of("srch_id")],
            _data[:, column_of("prop_id")],
            _data[:, column_of("click_bool")],
            _data[:, column_of("booking_bool")])


# Load Data
# =========
path = os.getcwd()
# NOTE(review): expects the combined/sampled training matrix produced by
# the combine/sampling steps to already exist under ./validdata — confirm.
data = np.load(path + '/validdata/combined_sampled_normalized_valid_train.npy')
# Transpose so rows are observations and columns are attributes.
data = np.transpose(data)

item_helper = ItemHelper()
column_names = item_helper.get_all_column_names_new()

# Kept mid-file to preserve the script's original import order.
import sys
# Number of rows per split, taken from the first CLI argument.
num_of_selection = int(sys.argv[1])

# Two disjoint leading slices: `data` keeps the first block of rows,
# `data2` the second (e.g. train vs. holdout).
data2 = data[num_of_selection:num_of_selection * 2, :]
data = data[:num_of_selection, :]
#data2=data

# %data2=data
# booked_index = data[:, column_names.index("booking_bool")]
# data_booked = data[booked_index==1,:]
# data_not_booked = data[booked_index==0,:]
# data_booked_selected1 = data_booked[:num_of_selection,:]
# data_booked_selected2 = data_booked[num_of_selection:num_of_selection*2,:]
Пример #10
0
def get_correlation_matrix():
    item_helper = ItemHelper()
    column_names = item_helper.get_all_column_names_new()
    for i in range(0,len(column_names)):
        for j in range(i+1,len(column_names)):
            print column_names[i], column_names[j]
Пример #11
0
def extract_features(_data, _selected_features_index, _column_names):
    """Pull the feature columns and the id/label columns out of *_data*."""
    lookup = _column_names.index
    features = _data[:, _selected_features_index]
    srch_ids = _data[:, lookup("srch_id")]
    hotel_ids = _data[:, lookup("prop_id")]
    clicks = _data[:, lookup("click_bool")]
    bookings = _data[:, lookup("booking_bool")]
    return features, srch_ids, hotel_ids, clicks, bookings

# Load Data
# =========
path=os.getcwd()
# NOTE(review): assumes the combined/sampled training matrix already
# exists under ./validdata — confirm against the pipeline scripts.
data=np.load(path+'/validdata/combined_sampled_normalized_valid_train.npy')
# Transpose so rows are observations and columns are attributes.
data=np.transpose(data)

item_helper = ItemHelper()
column_names = item_helper.get_all_column_names_new()



# Kept mid-file to preserve the script's original import order.
import sys
# Number of rows per split, taken from the first CLI argument.
num_of_selection = int(sys.argv[1])

# Two disjoint leading slices: `data` keeps the first block of rows,
# `data2` the second.
data2=data[num_of_selection:num_of_selection*2,:]
data=data[:num_of_selection,:]
#data2=data

# %data2=data
# booked_index = data[:, column_names.index("booking_bool")]
# data_booked = data[booked_index==1,:]
# data_not_booked = data[booked_index==0,:]