def map_str_to_digit(data, features, no_map_features, only_map_features=" ", label=" "):
    """Convert string-valued feature columns to numeric codes.

    Columns named in *no_map_features* are skipped; when *only_map_features*
    is anything other than the sentinel " ", only those columns are eligible.
    Returns (digited_data, features_map_info) where features_map_info maps a
    feature name to its value->code mapping (empty for non-string columns).
    """
    skip_idx = get_known_features_index(features, no_map_features)
    features_map_info = dict()
    str_names = np.array(load_result("str_features.csv"))[0]
    str_idx = get_known_features_index(features, str_names)
    allowed_idx = range(len(features))
    if only_map_features != " ":
        allowed_idx = get_known_features_index(features, only_map_features)
    # column 0 is the user id, so start at 1
    for pos in range(1, len(features)):
        if pos in skip_idx or pos not in allowed_idx:
            continue
        mapping = OrderedDict()
        cla = feature_value_class(data, pos, label, str_idx)
        # only string-valued columns actually need mapping
        if cla["str_feature"]:
            data, mapping = map_str_feature_to_value(data, pos, cla)
        features_map_info[features[pos]] = mapping
    digited_data = convert_to_numerical(data, features)
    return digited_data, features_map_info
def new_UserInfo_18(data, features):
    """Bin the UserInfo_18 age column into 5 ordinal buckets and append the
    result as a new feature "UserInfo_18_bined"; the raw column is dropped.

    Buckets: <22 -> 0, <30 -> 1, <40 -> 2, <50 -> 3, otherwise -> 4.
    Returns (new_data, new_features).
    """
    solved_features = ["UserInfo_18"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "UserInfo_18_bined"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    feature_data = np.zeros((len(data), 1))
    for user in range(data.shape[0]):
        # BUG FIX: the original compared the raw string against "22"/"30"/...
        # lexicographically, which is wrong for any value outside two digits
        # (e.g. "9" sorts above "22", "100" sorts below it).  Compare
        # numerically instead; missing "-1" still falls into bucket 0 as before.
        user_age = float(data[user, fea_indexs[0]])
        if user_age < 22:
            feature_data[user, 0] = 0
        elif user_age < 30:
            feature_data[user, 0] = 1
        elif user_age < 40:
            feature_data[user, 0] = 2
        elif user_age < 50:
            feature_data[user, 0] = 3
        else:
            feature_data[user, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
def new_UserInfo_differ(data, features, key_features, feature_name, deleted_all=True):
    """Append a binary feature named *feature_name* that is 1 when the
    *key_features* columns of a row are not all identical.

    Afterwards the source columns are dropped: all of them when *deleted_all*,
    otherwise all but the first.  Returns (new_data, new_features).
    """
    idx = get_known_features_index(features, key_features)
    new_features = np.concatenate((features, np.array([feature_name])))
    flags = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        # a row "differs" when its key values collapse to more than one value
        if len(set(list(data[row, idx]))) != 1:
            flags[row, 0] = 1
    new_data = np.concatenate((data, flags), axis=1)
    to_drop = key_features if deleted_all else key_features[1:]
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=to_drop)
    print(deleted)
    return new_data, new_features
def thirdParty_one_period_info(data, features, label, period_number):
    """Collect missing-value statistics over the 16 ThirdParty_Info_Period<p>_i
    columns of a single period.

    *period_number* is a sequence whose first element is the period id.
    Returns an OrderedDict with the miss count, how many missing rows are
    positive, and the affected row indexes with their labels.
    """
    prefix = "ThirdParty_Info_Period" + period_number[0] + "_"
    solved_features = [prefix + str(i) for i in range(1, 17)]
    indexs = get_known_features_index(features, solved_features)
    info = OrderedDict()
    info["missing count"] = 0
    info["missing contain positive count"] = 0
    info["missing indexs"] = list()
    info["missing indexs label"] = list()
    for row in range(data.shape[0]):
        if -1 not in list(data[row, indexs]):
            continue
        info["missing count"] += 1
        if label[row] == 1:
            info["missing contain positive count"] += 1
        info["missing indexs"].append(row)
        info["missing indexs label"].append(label[row])
    return info
def new_UserInfo_18(data, features):
    """Bin the UserInfo_18 age column into 5 ordinal buckets and append the
    result as a new feature "UserInfo_18_bined"; the raw column is dropped.

    Buckets: <22 -> 0, <30 -> 1, <40 -> 2, <50 -> 3, otherwise -> 4.
    Returns (new_data, new_features).
    """
    solved_features = ["UserInfo_18"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "UserInfo_18_bined"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    feature_data = np.zeros((len(data), 1))
    for user in range(data.shape[0]):
        # BUG FIX: the original compared the raw string against "22"/"30"/...
        # lexicographically, which is wrong for any value outside two digits
        # (e.g. "9" sorts above "22", "100" sorts below it).  Compare
        # numerically instead; missing "-1" still falls into bucket 0 as before.
        user_age = float(data[user, fea_indexs[0]])
        if user_age < 22:
            feature_data[user, 0] = 0
        elif user_age < 30:
            feature_data[user, 0] = 1
        elif user_age < 40:
            feature_data[user, 0] = 2
        elif user_age < 50:
            feature_data[user, 0] = 3
        else:
            feature_data[user, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
def correlation_between_properties(data, features):
    """Compute pairwise correlation between every pair of feature columns.

    Pearson is used when both columns are numeric; Spearman when either one
    is string-typed (per str_features.csv).  Pairs with |cor| >= 0.2 are
    appended to pearsonr_spearmanr_results.csv; pairs with |cor| >= 0.86 also
    to pearsonr_spearmanr_Strong_correlation.csv.
    """
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    # PERF: membership is tested O(n^2) times in the pair loop below,
    # so use a set instead of a list for O(1) lookups.
    indexs = set(get_known_features_index(features, fixed_str_features))
    title = ["features1", "features2", "calculate_method", "cor", "pval"]
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    # NOTE(review): unlike the other helpers this starts at column 0, which
    # elsewhere is the user id — confirm that is intended.
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = [features[fea_pos], features[fea_pos_add]]
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            if fea_pos not in indexs and fea_pos_add not in indexs:
                # both columns are numeric
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:
                # at least one column is string-typed
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result, "pearsonr_spearmanr_results.csv",
                            style="a+")
            if abs(cor) >= 0.86:
                save_result(info_result,
                            "pearsonr_spearmanr_Strong_correlation.csv",
                            style="a+")
def new_EI_5_6_7_8(data, features):
    """Combine Education_Info5..8 into one categorical feature
    "combine_EI_5_6_7_8" using hand-derived mapping rules, then drop
    Education_Info6..8 (Info5 is kept).  Unmatched combinations map to 3.
    Returns (new_data, new_features).
    """
    key_features = ["Education_Info5", "Education_Info6",
                    "Education_Info7", "Education_Info8"]
    fea_indexs = get_known_features_index(features, key_features)
    new_features = np.concatenate((features, np.array(["combine_EI_5_6_7_8"])))
    # hand-tuned mapping from the raw 4-tuple to a small category id
    rule_to_code = dict()
    for combo in [("1", "AQ", "-1", "T"), ("1", "AQ", "-1", "80"),
                  ("1", "U", "-1", "-1"), ("1", "AQ", "-1", "-1"),
                  ("1", "B", "-1", "-1"), ("1", "A", "-1", "-1"),
                  ("1", "AM", "-1", "80"), ("1", "A", "-1", "F"),
                  ("1", "B", "-1", "AE"), ("1", "U", "-1", "AE"),
                  ("1", "AQ", "-1", "V"), ("1", "AM", "-1", "V")]:
        rule_to_code[combo] = 0
    for combo in [("1", "A", "-1", "T"), ("1", "AQ", "-1", "F"),
                  ("1", "AM", "-1", "-1"), ("1", "AM", "-1", "F"),
                  ("1", "AM", "-1", "T")]:
        rule_to_code[combo] = 1
    rule_to_code[("0", "E", "E", "E")] = 2
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        combo = tuple(data[user, fea_indexs])
        feature_data[user, 0] = rule_to_code.get(combo, 3)
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=key_features[1:])
    print(deleted)
    return new_data, new_features
def new_EI_8(data, features):
    """Encode Education_Info8 as an ordinal category in a new column
    "Education_Info8_info_(cat)".  The raw column is kept.  Unknown values
    are reported and left at the 0 default.  Returns (new_data, new_features).
    """
    solved_features = ["Education_Info8"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["Education_Info8_info_(cat)"])))
    feature_data = np.zeros((len(data), 1))
    # V and AE share bucket 0 by design
    code_of = {"V": 0, "AE": 0, "80": 1, "F": 2, "T": 3, "-1": 4, "E": 5}
    for user in range(data.shape[0]):
        value = data[user, fea_indexs[0]]
        if value in code_of:
            feature_data[user, 0] = code_of[value]
        else:
            print("error in Education_Info8")
    new_data = np.concatenate((data, feature_data), axis=1)
    print("Education_Info8 solved")
    return new_data, new_features
def new_WI_19(data, features):
    """Encode WeblogInfo_19 into a new categorical column
    "WeblogInfo_19_info_(cat)" (H,G,J,E,F,D -> 0..5, anything else -> 6)
    and drop the raw column.  Returns (new_data, new_features).
    """
    solved_features = ["WeblogInfo_19"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["WeblogInfo_19_info_(cat)"])))
    feature_data = np.zeros((len(data), 1))
    code_of = {"H": 0, "G": 1, "J": 2, "E": 3, "F": 4, "D": 5}
    for user in range(data.shape[0]):
        # unknown values fall into the catch-all bucket 6
        feature_data[user, 0] = code_of.get(data[user, fea_indexs[0]], 6)
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print("WeblogInfo_19 solved")
    print(deleted)
    return new_data, new_features
def new_WI_19(data, features):
    """Encode WeblogInfo_19 into a new categorical column
    "WeblogInfo_19_info_(cat)" (H,G,J,E,F,D -> 0..5, anything else -> 6)
    and drop the raw column.  Returns (new_data, new_features).
    """
    solved_features = ["WeblogInfo_19"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["WeblogInfo_19_info_(cat)"])))
    feature_data = np.zeros((len(data), 1))
    code_of = {"H": 0, "G": 1, "J": 2, "E": 3, "F": 4, "D": 5}
    for user in range(data.shape[0]):
        value = data[user, fea_indexs[0]]
        # unknown values fall into the catch-all bucket 6
        feature_data[user, 0] = code_of[value] if value in code_of else 6
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print("WeblogInfo_19 solved")
    print(deleted)
    return new_data, new_features
def new_EI_1_2_3_4(data, features):
    """Combine Education_Info1..4 into one categorical feature
    "combine_EI_1_2_3_4" via hand-derived mapping rules, then drop
    Education_Info2..4 (Info1 is kept).

    Unmatched combinations print an error and keep the default value 1.
    Returns (new_data, new_features).
    """
    key_features = ["Education_Info1", "Education_Info2",
                    "Education_Info3", "Education_Info4"]
    fea_indexs = get_known_features_index(features, key_features)
    new_features = np.concatenate((features, np.array(["combine_EI_1_2_3_4"])))
    # hand-tuned mapping from the raw 4-tuple to a small category id
    rule_to_code = dict()
    for combo in [("1", "AQ", "毕业", "T"), ("1", "A", "毕业", "V"),
                  ("1", "AN", "结业", "T"), ("1", "AM", "结业", "T"),
                  ("1", "B", "毕业", "AE"), ("1", "A", "结业", "T"),
                  ("1", "A", "毕业", "AR")]:
        rule_to_code[combo] = 0
    for combo in [("1", "U", "毕业", "AE"), ("1", "AM", "毕业", "AR"),
                  ("1", "AM", "毕业", "V"), ("1", "AQ", "毕业", "F"),
                  ("1", "A", "毕业", "F"), ("1", "AN", "毕业", "T"),
                  ("1", "AQ", "毕业", "V")]:
        rule_to_code[combo] = 1
    for combo in [("1", "AM", "毕业", "T"), ("1", "A", "毕业", "T"),
                  ("1", "AM", "毕业", "F")]:
        rule_to_code[combo] = 2
    rule_to_code[("0", "E", "E", "E")] = 3
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        combo = tuple(data[user, fea_indexs])
        if combo in rule_to_code:
            feature_data[user, 0] = rule_to_code[combo]
        else:
            print("error!!!!")
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=key_features[1:])
    print(deleted)
    return new_data, new_features
def new_EI_8(data, features):
    """Encode Education_Info8 as an ordinal category in a new column
    "Education_Info8_info_(cat)".  The raw column is kept.  Unknown values
    are reported and left at the 0 default.  Returns (new_data, new_features).
    """
    fea_indexs = get_known_features_index(features, ["Education_Info8"])
    new_features = np.concatenate(
        (features, np.array(["Education_Info8_info_(cat)"])))
    feature_data = np.zeros((len(data), 1))
    # V and AE share bucket 0 by design
    code_of = {"V": 0, "AE": 0, "80": 1, "F": 2, "T": 3, "-1": 4, "E": 5}
    for user in range(data.shape[0]):
        value = data[user, fea_indexs[0]]
        if value in code_of:
            feature_data[user, 0] = code_of[value]
        else:
            print("error in Education_Info8")
    new_data = np.concatenate((data, feature_data), axis=1)
    print("Education_Info8 solved")
    return new_data, new_features
def new_WI_20_by_present(data, features):
    """Bucket WeblogInfo_20 into a new categorical column
    "WeblogInfo_20_present_info_(cat)".

    Known codes map directly to 0..5 ("-1" -> 5); codes outside the hand-made
    lists are bucketed by how often they occur.  The raw column is dropped.
    Returns (new_data, new_features).
    """
    solved_features = ["WeblogInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "WeblogInfo_20_present_info_(cat)"
    new_features = np.concatenate((features, np.array([feature_name])))
    unseen_values = OrderedDict()  # code -> list of row indexes carrying it
    feature_data = np.zeros((len(data), 1))
    map_to_zero = ['F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10',
                   'F11', 'O', 'C13', 'I6', 'C16', 'I7', 'I10']
    map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
    map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
    map_to_three = ['I3', 'U', 'C21', 'I4']
    map_to_four = ['I5']
    map_to_five = ['-1']
    for user in range(data.shape[0]):
        fea_value = data[user, fea_indexs[0]]
        if fea_value in map_to_zero:
            feature_data[user, 0] = 0
        elif fea_value in map_to_one:
            feature_data[user, 0] = 1
        elif fea_value in map_to_two:
            feature_data[user, 0] = 2
        elif fea_value in map_to_three:
            feature_data[user, 0] = 3
        elif fea_value in map_to_four:
            feature_data[user, 0] = 4
        elif fea_value in map_to_five:
            feature_data[user, 0] = 5
        else:
            if fea_value not in unseen_values:
                unseen_values[fea_value] = list()
            unseen_values[fea_value].append(user)
    # bucket the codes not in the hand-made lists by their frequency
    for fea_value, users in unseen_values.items():
        # BUG FIX: the original tested fea_value[0] == "-1" (first character
        # against a two-char string, never true) and then unconditionally fell
        # through to the frequency buckets, overwriting the 5.  Compare the
        # whole string and make the branch exclusive.
        if fea_value == "-1":
            feature_data[users, 0] = 5
        elif len(users) < 20:
            feature_data[users, 0] = 0
        elif len(users) < 100:
            feature_data[users, 0] = 1
        elif len(users) < 1000:
            feature_data[users, 0] = 2
        elif len(users) < 5000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print("WeblogInfo_20 solved present")
    print(deleted)
    return new_data, new_features
def new_WI_20_by_present(data, features):
    """Bucket WeblogInfo_20 into a new categorical column
    "WeblogInfo_20_present_info_(cat)".

    Known codes map directly to 0..5 ("-1" -> 5); codes outside the hand-made
    lists are bucketed by how often they occur.  The raw column is dropped.
    Returns (new_data, new_features).
    """
    solved_features = ["WeblogInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "WeblogInfo_20_present_info_(cat)"
    new_features = np.concatenate((features, np.array([feature_name])))
    unseen_values = OrderedDict()  # code -> list of row indexes carrying it
    feature_data = np.zeros((len(data), 1))
    map_to_zero = ['F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10',
                   'F11', 'O', 'C13', 'I6', 'C16', 'I7', 'I10']
    map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
    map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
    map_to_three = ['I3', 'U', 'C21', 'I4']
    map_to_four = ['I5']
    map_to_five = ['-1']
    for user in range(data.shape[0]):
        fea_value = data[user, fea_indexs[0]]
        if fea_value in map_to_zero:
            feature_data[user, 0] = 0
        elif fea_value in map_to_one:
            feature_data[user, 0] = 1
        elif fea_value in map_to_two:
            feature_data[user, 0] = 2
        elif fea_value in map_to_three:
            feature_data[user, 0] = 3
        elif fea_value in map_to_four:
            feature_data[user, 0] = 4
        elif fea_value in map_to_five:
            feature_data[user, 0] = 5
        else:
            if fea_value not in unseen_values:
                unseen_values[fea_value] = list()
            unseen_values[fea_value].append(user)
    # bucket the codes not in the hand-made lists by their frequency
    for fea_value, users in unseen_values.items():
        # BUG FIX: the original tested fea_value[0] == "-1" (first character
        # against a two-char string, never true) and then unconditionally fell
        # through to the frequency buckets, overwriting the 5.  Compare the
        # whole string and make the branch exclusive.
        if fea_value == "-1":
            feature_data[users, 0] = 5
        elif len(users) < 20:
            feature_data[users, 0] = 0
        elif len(users) < 100:
            feature_data[users, 0] = 1
        elif len(users) < 1000:
            feature_data[users, 0] = 2
        elif len(users) < 5000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print("WeblogInfo_20 solved present")
    print(deleted)
    return new_data, new_features
def extract_data_by_features(data, features, needed_features):
    """Return (new_data, new_features) containing only the *needed_features*
    columns of *data*, cast to int64."""
    idx = get_known_features_index(features, needed_features)
    new_data = np.ones((data.shape[0], len(needed_features)), dtype=np.int64)
    print(len(idx))
    print(new_data.shape)
    # copy column by column; the int64 destination casts each source column
    for col, src in enumerate(idx):
        new_data[:, col] = data[:, src]
    return new_data, needed_features
def map_str_to_digit_with_experience(data, features, digited_special_str_features,
                                     contain_special_features):
    """Map string feature values to digits using a previously saved mapping.

    Only columns that are string-typed (per str_features.csv), not already
    digited, and listed in *contain_special_features* are mapped.  Rows whose
    value is absent from the saved mapping are collected and deleted.
    Returns the fully numerical data array.
    """
    map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME,
                                 dir_name="resultData/features_map")
    print(map_experience)
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(
        features, fixed_str_features)
    digited_special_str_features_index = get_known_features_index(
        features, digited_special_str_features)
    contain_special_features_index = get_known_features_index(
        features, contain_special_features)
    remember = list()  # row indexes whose value is unknown to the mapping
    for fea_pos in range(1, len(features)):
        # str style features + not yet digited + the str we want to digit
        if fea_pos in fixed_str_features_index and \
                fea_pos not in digited_special_str_features_index and \
                fea_pos in contain_special_features_index:
            # the ListingInfo date may be reversed (day-first vs year-first)
            if features[fea_pos] == "ListingInfo" and int(
                    data[0, fea_pos].split("/")[0]) < 1000:
                data = reverse_date(data, fea_pos)
            if features[fea_pos] in map_experience.keys():
                mapping = map_experience[features[fea_pos]]
                for i in range(len(data)):
                    if data[i, fea_pos] == "-1":
                        continue
                    try:
                        data[i, fea_pos] = mapping[data[i, fea_pos]]
                    except KeyError:
                        # BUG FIX: the original debug print re-evaluated the
                        # failing lookup inside the (bare) except block, so
                        # the KeyError was raised again and escaped the
                        # handler.  Print the offending raw value instead and
                        # catch only the expected KeyError.
                        if i < 50:
                            print(features[fea_pos])
                            print(mapping)
                            print(data[i, fea_pos])
                        remember.append(i)  # this is an error value
    #print(remember)
    data = np.delete(data, remember, 0)
    digited_data = convert_to_numerical(data, features)
    return digited_data
def compare_features_info2(data, features, key_features):
    """Count how often each '_'-joined combination of the *key_features*
    values occurs.  Returns an OrderedDict combo -> count."""
    idx = get_known_features_index(features, key_features)
    counts = OrderedDict()
    for row in range(data.shape[0]):
        combo = "_".join(str(v) for v in data[row, idx])
        counts[combo] = counts.get(combo, 0) + 1
    return counts
def one_features_info2(data, features, key_feature):
    """Count occurrences of each value of *key_feature*.

    Returns a dict mapping value -> occurrence count.
    """
    fea_index = get_known_features_index(features, key_feature)
    counts = dict()
    for row in range(data.shape[0]):
        value = data[row, fea_index][0]
        counts[value] = counts.get(value, 0) + 1
    return counts
def one_features_info(data, features, label, key_feature):
    """Count total and positive-label occurrences per value of *key_feature*.

    Returns a dict mapping value -> [total_count, positive_count].
    """
    fea_index = get_known_features_index(features, key_feature)
    per_value = dict()
    for row in range(data.shape[0]):
        value = data[row, fea_index][0]
        if value not in per_value:
            per_value[value] = [0, 0]
        per_value[value][0] += 1
        if label[row] == 1:
            per_value[value][1] += 1
    return per_value
def fill_all_missing(data, features, label = None):
    """Fill missing values for every feature column via fill_the_missing.

    Returns (data, features).  *label* is forwarded to the per-feature
    classifier and the filling routine; it may be None.
    """
    # names of the columns known to hold string values
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    #!start from range(1,...) is because the first line of the feature is the id, useless
    for fea_pos in range(1, len(features)):
        fea_val_cla = feature_value_class(data, fea_pos, label, indexs)
        # NOTE(review): fea_val_cla[-1] is presumably the entry for the
        # missing marker (-1); the fill runs only when some rows actually
        # present it — confirm against feature_value_class.
        if not fea_val_cla[-1]._present_num == 0:
            if fea_pos == 5:
                # leftover debug output for column 5
                print(fea_val_cla)
            data = fill_the_missing(data, fea_pos, fea_val_cla, label)
    #write_to_deleted_features_area(np.array(deleted_feas))
    return data, features
def according_coefficient_variation_delete(data, features):
    """Compute the coefficient of variation for the columns listed in
    complex_value_features.csv.

    Columns whose data cannot be processed are skipped (best-effort).
    Returns OrderedDict feature name -> coefficient of variation.
    """
    waiting_to_delete = np.array(load_result("complex_value_features.csv"))
    waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size,))
    #print(waiting_to_delete)
    indexs = get_known_features_index(features, waiting_to_delete)
    coefficient_variation_info = OrderedDict()
    for fea_pos in indexs:
        try:
            cv = stats.variation(data[:, fea_pos])
            coefficient_variation_info[features[fea_pos]] = cv
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; keep the best-effort skip but
            # only for real errors.
            pass
    return coefficient_variation_info
def map_str_to_digit_with_experience(data, features, digited_special_str_features,
                                     contain_special_features):
    """Map string feature values to digits using a previously saved mapping.

    Only columns that are string-typed (per str_features.csv), not already
    digited, and listed in *contain_special_features* are mapped.  Rows whose
    value is absent from the saved mapping are collected and deleted.
    Returns the fully numerical data array.
    """
    map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME,
                                 dir_name="resultData/features_map")
    print(map_experience)
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(
        features, fixed_str_features)
    digited_special_str_features_index = get_known_features_index(
        features, digited_special_str_features)
    contain_special_features_index = get_known_features_index(
        features, contain_special_features)
    remember = list()  # row indexes whose value is unknown to the mapping
    for fea_pos in range(1, len(features)):
        # str style features + not yet digited + the str we want to digit
        if fea_pos in fixed_str_features_index and \
                fea_pos not in digited_special_str_features_index and \
                fea_pos in contain_special_features_index:
            # the ListingInfo date may be reversed (day-first vs year-first)
            if features[fea_pos] == "ListingInfo" and int(
                    data[0, fea_pos].split("/")[0]) < 1000:
                data = reverse_date(data, fea_pos)
            if features[fea_pos] in map_experience.keys():
                mapping = map_experience[features[fea_pos]]
                for i in range(len(data)):
                    if data[i, fea_pos] == "-1":
                        continue
                    try:
                        data[i, fea_pos] = mapping[data[i, fea_pos]]
                    except KeyError:
                        # BUG FIX: the original debug print re-evaluated the
                        # failing lookup inside the (bare) except block, so
                        # the KeyError was raised again and escaped the
                        # handler.  Print the offending raw value instead and
                        # catch only the expected KeyError.
                        if i < 50:
                            print(features[fea_pos])
                            print(mapping)
                            print(data[i, fea_pos])
                        remember.append(i)  # this is an error value
    #print(remember)
    data = np.delete(data, remember, 0)
    digited_data = convert_to_numerical(data, features)
    return digited_data
def according_coefficient_variation_delete(data, features):
    """Compute the coefficient of variation for the columns listed in
    complex_value_features.csv.

    Columns whose data cannot be processed are skipped (best-effort).
    Returns OrderedDict feature name -> coefficient of variation.
    """
    waiting_to_delete = np.array(load_result("complex_value_features.csv"))
    waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size, ))
    #print(waiting_to_delete)
    indexs = get_known_features_index(features, waiting_to_delete)
    coefficient_variation_info = OrderedDict()
    for fea_pos in indexs:
        try:
            cv = stats.variation(data[:, fea_pos])
            coefficient_variation_info[features[fea_pos]] = cv
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; keep the best-effort skip but
            # only for real errors.
            pass
    return coefficient_variation_info
def missed_instances_info(data, features, label, key_feature=None):
    """Collect missing-value (-1) statistics over the *key_feature* columns
    (all columns when None).

    Returns (missed_infos, users_miss_info):
      missed_infos    - overall counts of rows with at least one miss, split
                        by label (legacy "negitive" spelling kept: it is a
                        runtime dict key consumers may rely on).
      users_miss_info - per user id: miss count, missed column indexes/names
                        and the label (details only filled for rows that
                        actually miss data).
    """
    missed_infos = OrderedDict()
    users_miss_info = OrderedDict()
    # FIX: `is None` instead of `== None` — the equality form can broadcast
    # elementwise when an array is passed.
    if key_feature is None:
        indexs = range(len(features))
        key_data = data
        key_feature = features
    else:
        indexs = get_known_features_index(features, key_feature)
        key_data = data[:, indexs]
    print(key_data.shape)
    print(key_data.shape[0])
    print(key_data.shape[1])
    missed_infos["missed_instances_sum"] = 0
    missed_infos["positive_sum"] = 0
    missed_infos["negitive_sum"] = 0
    for i in range(key_data.shape[0]):
        user_miss_count = 0
        missed_features_index = list()
        missed_features = list()
        flag = 0
        user_idx = data[i, 0]  # column 0 holds the user id
        users_miss_info[user_idx] = OrderedDict()
        for j in range(key_data.shape[1]):
            if key_data[i, j] == -1:
                flag = 1
                user_miss_count += 1
                missed_features_index.append(indexs[j])
                missed_features.append(key_feature[j])
        # at least one miss exists in this row
        if flag == 1:
            missed_infos["missed_instances_sum"] += 1
            if label[i] == 1:
                missed_infos["positive_sum"] += 1
            else:
                missed_infos["negitive_sum"] += 1
        # if this user missed anything, record the detail
        if user_miss_count:
            users_miss_info[user_idx]["missed_count"] = user_miss_count
            users_miss_info[user_idx]["miss_features_indexs"] = missed_features_index
            users_miss_info[user_idx]["missed_features"] = missed_features
            users_miss_info[user_idx]["label"] = label[i]
    return missed_infos, users_miss_info
def map_str_to_digit(data, features, no_map_features, only_map_features = " ", label = " "):
    """Convert string-valued feature columns to numeric codes.

    Columns named in *no_map_features* are skipped; when *only_map_features*
    is anything other than the sentinel " ", only those columns are eligible.
    Returns (digited_data, features_map_info) where features_map_info maps a
    feature name to its value->code mapping (empty for non-string columns).
    """
    skip_idx = get_known_features_index(features, no_map_features)
    features_map_info = dict()
    str_names = np.array(load_result("str_features.csv"))[0]
    str_idx = get_known_features_index(features, str_names)
    allowed_idx = range(len(features))
    if only_map_features != " ":
        allowed_idx = get_known_features_index(features, only_map_features)
    # column 0 is the user id, so start at 1
    for pos in range(1, len(features)):
        if pos in skip_idx or pos not in allowed_idx:
            continue
        mapping = OrderedDict()
        cla = feature_value_class(data, pos, label, str_idx)
        # only string-valued columns actually need mapping
        if cla["str_feature"]:
            data, mapping = map_str_feature_to_value(data, pos, cla)
        features_map_info[features[pos]] = mapping
    digited_data = convert_to_numerical(data, features)
    return digited_data, features_map_info
def sta_thirdParty_info(data, features, type_number, label=None):
    """Per-user summary statistics (mean, std, cv, max, min) across the six
    ThirdParty_Info_Period<i>_<type> columns of one value type.

    *type_number* is a sequence whose first element is the type id.
    Returns OrderedDict row index -> OrderedDict of stats (with the label
    included when *label* is given).
    """
    based_feature_name = "ThirdParty_Info_Period"
    solved_features = list()
    for i in range(1, 7):
        solved_features.append(based_feature_name + str(i) + "_" + type_number[0])
    indexs = get_known_features_index(features, solved_features)
    sta_name = "ms_type" + type_number[0]
    users_stability = OrderedDict()
    for user in range(data.shape[0]):
        users_stability[user] = OrderedDict()
        # BUG FIX: `not label == None` breaks when label is a numpy array
        # (elementwise comparison yields an array that cannot be negated);
        # the identity test is the correct form.
        if label is not None:
            users_stability[user]["label"] = label[user]
        calculate_data = data[user, indexs]
        users_stability[user]["value"] = list(calculate_data)
        users_stability[user][sta_name] = dict()
        fea_average = round(float(np.mean(calculate_data)), 3)
        fea_std = round(float(np.std(calculate_data)), 3)
        if fea_average == 0:
            # avoid division by zero for all-zero periods
            fea_cv = 0
        else:
            fea_cv = round(float(fea_std / fea_average), 3)
        users_stability[user][sta_name]["average"] = fea_average
        users_stability[user][sta_name]["Standard deviation"] = fea_std
        users_stability[user][sta_name]["cv"] = fea_cv
        users_stability[user][sta_name]["max"] = np.amax(calculate_data)
        users_stability[user][sta_name]["min"] = np.amin(calculate_data)
    return users_stability
def new_UserInfo_19_20(data, features):
    """Append binary feature "UserInfo_19_20_wrong_province_city": 1 when
    UserInfo_19 or UserInfo_20 is missing ("-1").  The raw columns are then
    dropped.  Returns (new_data, new_features).
    """
    solved_features = ["UserInfo_19", "UserInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["UserInfo_19_20_wrong_province_city"])))
    feature_data = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        if "-1" in list(data[row, fea_indexs]):
            feature_data[row, 0] = 1
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
def new_UserInfo_19_20(data, features):
    """Append binary feature "UserInfo_19_20_wrong_province_city": 1 when
    UserInfo_19 or UserInfo_20 is missing ("-1").  The raw columns are then
    dropped.  Returns (new_data, new_features).
    """
    solved_features = ["UserInfo_19", "UserInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["UserInfo_19_20_wrong_province_city"])))
    feature_data = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        values = list(data[row, fea_indexs])
        if str(-1) in values:
            feature_data[row, 0] = 1
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
def new_EI_1_2_3_4(data, features):
    """Combine Education_Info1..4 into one categorical feature
    "combine_EI_1_2_3_4" via hand-derived mapping rules, then drop
    Education_Info2..4 (Info1 is kept).

    Unmatched combinations print an error and keep the default value 1.
    Returns (new_data, new_features).
    """
    key_features = ["Education_Info1", "Education_Info2",
                    "Education_Info3", "Education_Info4"]
    fea_indexs = get_known_features_index(features, key_features)
    new_features = np.concatenate((features, np.array(["combine_EI_1_2_3_4"])))
    # hand-tuned mapping from the raw 4-tuple to a small category id
    rule_to_code = dict()
    for combo in [("1", "AQ", "毕业", "T"), ("1", "A", "毕业", "V"),
                  ("1", "AN", "结业", "T"), ("1", "AM", "结业", "T"),
                  ("1", "B", "毕业", "AE"), ("1", "A", "结业", "T"),
                  ("1", "A", "毕业", "AR")]:
        rule_to_code[combo] = 0
    for combo in [("1", "U", "毕业", "AE"), ("1", "AM", "毕业", "AR"),
                  ("1", "AM", "毕业", "V"), ("1", "AQ", "毕业", "F"),
                  ("1", "A", "毕业", "F"), ("1", "AN", "毕业", "T"),
                  ("1", "AQ", "毕业", "V")]:
        rule_to_code[combo] = 1
    for combo in [("1", "AM", "毕业", "T"), ("1", "A", "毕业", "T"),
                  ("1", "AM", "毕业", "F")]:
        rule_to_code[combo] = 2
    rule_to_code[("0", "E", "E", "E")] = 3
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        combo = tuple(data[user, fea_indexs])
        if combo in rule_to_code:
            feature_data[user, 0] = rule_to_code[combo]
        else:
            print("error!!!!")
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=key_features[1:])
    print(deleted)
    return new_data, new_features
def compare_features_info(data, features, label, key_features):
    """Compare the *key_features* columns row by row.

    Gathers counts of rows whose key values differ vs. agree, per-combination
    counts with positive-label sub-counts, and how many "same" rows are the
    missing marker.  Returns an OrderedDict of all statistics.
    """
    fea_indexs = get_known_features_index(features, key_features)
    result = OrderedDict()
    result["num_differ"] = 0
    result["different_combine_info"] = dict()
    result["num_differ_positive"] = 0
    result["num_same"] = 0
    result["same_combine_info"] = dict()
    result["num_same_positive"] = 0
    result["num_same_miss"] = 0
    for row in range(data.shape[0]):
        values = list(data[row, fea_indexs])
        combine_data = reduce(lambda x, y: str(x) + '_' + str(y), values)
        if len(set(values)) != 1:
            result["num_differ"] += 1
            bucket = result["different_combine_info"].setdefault(
                combine_data, [0, 0])
            bucket[0] += 1
            if label[row] == 1:
                result["num_differ_positive"] += 1
                bucket[1] += 1
        else:
            result["num_same"] += 1
            bucket = result["same_combine_info"].setdefault(
                combine_data, [0, 0])
            bucket[0] += 1
            if label[row] == 1:
                result["num_same_positive"] += 1
                bucket[1] += 1
            if values[0] == str(-1):
                result["num_same_miss"] += 1
    return result
def new_UserInfo_11_12_13(data, features):
    """Pack the three UserInfo_11/12/13 flags into one bit-weighted integer
    feature "UserInfo_11_12_13_info" (negative results clamp to 0).  The raw
    columns are kept.  Returns (new_data, new_features).
    """
    solved_features = ["UserInfo_11", "UserInfo_12", "UserInfo_13"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["UserInfo_11_12_13_info"])))
    feature_data = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        # accumulate base-2 style: same as reduce(lambda x, y: x*2 + y, ...)
        packed = 0
        for flag in data[row, fea_indexs]:
            packed = packed * 2 + int(flag)
        feature_data[row, 0] = packed if packed > 0 else 0
    new_data = np.concatenate((data, feature_data), axis=1)
    print("extract from UserInfo 11 12 13")
    return new_data, new_features
def new_UserInfo_11_12_13(data, features):
    """Pack the three UserInfo_11/12/13 flags into one bit-weighted integer
    feature "UserInfo_11_12_13_info" (negative results clamp to 0).  The raw
    columns are kept.  Returns (new_data, new_features).
    """
    solved_features = ["UserInfo_11", "UserInfo_12", "UserInfo_13"]
    fea_indexs = get_known_features_index(features, solved_features)
    new_features = np.concatenate(
        (features, np.array(["UserInfo_11_12_13_info"])))
    feature_data = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        # accumulate base-2 style: same as reduce(lambda x, y: x*2 + y, ...)
        packed = 0
        for flag in list(data[row, fea_indexs]):
            packed = packed * 2 + int(flag)
        if packed > 0:
            feature_data[row, 0] = packed
        else:
            feature_data[row, 0] = 0
    new_data = np.concatenate((data, feature_data), axis=1)
    print("extract from UserInfo 11 12 13")
    return new_data, new_features
def correlation_between_properties(data, features):
    """Compute pairwise correlation between every pair of feature columns.

    Pearson is used when both columns are numeric; Spearman when either one
    is string-typed (per str_features.csv).  Pairs with |cor| >= 0.2 are
    appended to pearsonr_spearmanr_results.csv; pairs with |cor| >= 0.86 also
    to pearsonr_spearmanr_Strong_correlation.csv.
    """
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    # PERF: membership is tested O(n^2) times in the pair loop below,
    # so use a set instead of a list for O(1) lookups.
    indexs = set(get_known_features_index(features, fixed_str_features))
    title = ["features1", "features2", "calculate_method", "cor", "pval"]
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    # NOTE(review): unlike the other helpers this starts at column 0, which
    # elsewhere is the user id — confirm that is intended.
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = [features[fea_pos], features[fea_pos_add]]
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            if fea_pos not in indexs and fea_pos_add not in indexs:
                # both columns are numeric
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:
                # at least one column is string-typed
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result, "pearsonr_spearmanr_results.csv",
                            style="a+")
            if abs(cor) >= 0.86:
                save_result(info_result,
                            "pearsonr_spearmanr_Strong_correlation.csv",
                            style="a+")
def new_UserInfo_differ(data, features, key_features, feature_name, deleted_all=True):
    """Append a 0/1 flag telling whether the key features disagree per row.

    The flag is 1 when the row's values across `key_features` are not all
    identical.  Afterwards the key features are removed — all of them when
    `deleted_all` is True, otherwise all but the first.  Returns
    (new_data, new_features).
    """
    fea_indexs = get_known_features_index(features, key_features)
    new_features = np.concatenate((features, np.array([feature_name])))
    feature_data = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        distinct_values = set(data[row, fea_indexs])
        if len(distinct_values) != 1:
            feature_data[row, 0] = 1
    new_data = np.concatenate((data, feature_data), axis=1)
    delete_feas = key_features if deleted_all else key_features[1:]
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=delete_feas)
    print(deleted)
    return new_data, new_features
def save_features_info(data, features, label, file_name, dir_name="resultData"):
    """Dump per-feature summary statistics to a CSV under `dir_name`.

    For every feature (column 0 is skipped — presumably an ID column;
    verify against callers) a row is written containing: the feature
    name, whether it is string-valued, its number of distinct values,
    average / most-present value (overall and split by label where a
    label matrix is available), positive/negative counts, and one
    OrderedDict per distinct value with its occurrence counts.

    NOTE(review): `l = label[0, 0]` inside each `try` is a probe — if
    indexing the label fails (no labels supplied), the bare `except`
    substitutes "None" placeholders.  Missing optional keys in the
    summary dict are silently absorbed the same way; do not narrow these
    excepts without checking what `feature_value_class` may omit.
    """
    file_path = os.path.join(os.getcwd(), dir_name, file_name)
    # CSV header; spelling ("postitive"/"negitive") is kept as-is because
    # downstream readers may match on these exact strings.
    first_line = np.array(['features_name', 'str_feature',
                           'num_values',
                           'average|most_presentS',
                           'postitive(average|most_present)',
                           'negitive(average|most_present)',
                           'num_positive',
                           'num_negitive', 'feature_value_info'])
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    # Write the header, truncating any previous file.
    with open(file_path, "w", newline='') as csv_file:
        spamwriter = csv.writer(csv_file)
        spamwriter.writerow(first_line)
    from solve_data import feature_value_class, FeatureInData
    from collections import OrderedDict
    for fea_pos in range(1, len(features)):
        feature_info = list()
        feature_info.append(features[fea_pos])
        fea_val_cla = feature_value_class(data, fea_pos, label,
                                          fixed_str_features_index=indexs)
        feature_info.append(fea_val_cla["str_feature"])
        feature_info.append(fea_val_cla["num_of_value"])
        if fea_val_cla["str_feature"]:
            # String feature: report the most frequent value.
            feature_info.append(fea_val_cla["most_presentS"])
            try:
                l = label[0, 0]  # probe: raises when no label is available
                feature_info.append(fea_val_cla["most_presentS_positive"])
                feature_info.append(fea_val_cla["most_presentS_negitive"])
            except:
                feature_info.append("None")
                feature_info.append("None")
        else:
            # Numeric feature: report the mean.
            feature_info.append(fea_val_cla["average"])
            try:
                l = label[0, 0]
                feature_info.append(fea_val_cla["average_positive"])
                feature_info.append(fea_val_cla["average_negitive"])
            except:
                feature_info.append("None")
                feature_info.append("None")
        try:
            l = label[0, 0]
            feature_info.append(fea_val_cla["num_positive"])
            feature_info.append(fea_val_cla["num_negitive"])
        except:
            feature_info.append("None")
            feature_info.append("None")
        # Per-value breakdown: FeatureInData entries carry occurrence counts.
        for k, v in fea_val_cla.items():
            if isinstance(v, FeatureInData):
                value_info = OrderedDict()
                value_info["value"] = k
                value_info["present_num"] = v._present_num
                try:
                    l = label[0, 0]
                    value_info[
                        "respond_positive_num"] = v._respond_positive_num
                    value_info[
                        "respond_negitive_num"] = v._respond_negitive_num
                except:
                    pass
                feature_info.append(value_info)
        # Append this feature's row (file reopened per feature).
        with open(file_path, "a+", newline='') as csv_file:
            spamwriter = csv.writer(csv_file)
            spamwriter.writerow(feature_info)
def new_UserInfo_22_23_combine2(data, features):
    """Bin the (UserInfo_22, UserInfo_23) value pair into a small code.

    Known pairs are mapped to codes 0-5 via hand-built lookup tables;
    unseen pairs are binned by how many rows share them (rarer pairs get
    lower codes), except the all-missing pair ("-1", "-1") which is
    forced to code 5.  The two source columns are dropped afterwards.
    Returns (new_data, new_features).

    BUG FIX: the original fell through after assigning code 5 to the
    "-1_-1" fallback case, so the frequency ladder immediately
    overwrote it; a `continue` now preserves the assignment.
    """
    key_features = ["UserInfo_22", "UserInfo_23"]
    print("combine2")
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "UserInfo_combine2_by_present_22_23"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    ##### map rules #####
    map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'],
                   ['未婚', 'AJ'], ['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'],
                   ['未婚', 'AP'], ['再婚', 'G'], ['未婚', 'R'], ['已婚', 'AL'],
                   ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'],
                   ['未婚', 'AD'], ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'],
                   ['-1', 'R'], ['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'],
                   ['离婚', 'P'], ['已婚', 'Z'], ['初婚', 'G'], ['-1', 'K'],
                   ['再婚', 'O'], ['-1', 'AI'], ['离婚', '-1'], ['已婚', '-1'],
                   ['再婚', 'H'], ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'],
                   ['离婚', 'M'], ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'],
                   ['-1', 'AH'], ['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'],
                   ['离婚', 'O'], ['已婚', 'AC'], ['-1', 'H'], ['未婚', 'AC'],
                   ['-1', 'AK']]
    map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'],
                  ['-1', '大学本科(简称“大学'], ['已婚', '专科毕业']]
    map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'],
                  ['已婚', '大学本科(简称“大学'], ['已婚', 'M'], ['-1', 'Y'],
                  ['未婚', 'P'], ['-1', 'O'], ['已婚', 'AK'], ['未婚', 'AI'],
                  ['未婚', 'M'], ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'],
                  ['已婚', 'H'], ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
    map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'],
                    ['未婚', 'AB'], ['未婚', 'G']]
    map_to_four = [['D', 'D']]
    map_to_five = [['-1', '-1']]
    # (known pairs, code) in priority order
    bin_rules = [(map_to_zero, 0), (map_to_one, 1), (map_to_two, 2),
                 (map_to_three, 3), (map_to_four, 4), (map_to_five, 5)]
    none_finded_combine = OrderedDict()
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_22_23 = list(data[user, fea_indexs])
        for known_pairs, code in bin_rules:
            if EI_22_23 in known_pairs:
                feature_data[user, 0] = code
                break
        else:
            # Unknown pair: collect the rows sharing it, keyed by "a_b".
            EI_22_23_str = "_".join(EI_22_23)
            none_finded_combine.setdefault(EI_22_23_str, []).append(user)
    for EI_combine, users in none_finded_combine.items():
        parts = EI_combine.split("_")
        if parts[0] == "-1" and parts[1] == "-1":
            feature_data[users, 0] = 5
            continue  # without this, the ladder below overwrote the 5
        # Bin by how common the unseen pair is.
        if len(users) < 10:
            feature_data[users, 0] = 0
        elif len(users) < 20:
            feature_data[users, 0] = 1
        elif len(users) < 100:
            feature_data[users, 0] = 2
        elif len(users) < 1000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=key_features)
    print(deleted)
    return new_data, new_features
def save_features_info(data, features, label, file_name, dir_name="resultData"):
    """Write a per-feature statistics report as CSV into `dir_name`.

    One row per feature starting at column 1 (column 0 is skipped —
    presumably an identifier column; confirm with callers).  Columns:
    name, string-feature flag, distinct-value count, mean or most-present
    value (also split by label when labels exist), positive/negative
    counts, then one OrderedDict per distinct value with its counts.

    NOTE(review): every `try` first evaluates `l = label[0, 0]` as an
    availability probe; when labels are absent (or an optional summary
    key is missing) the bare `except` fills in "None" instead.  The
    control flow depends on these exceptions — handle with care.
    """
    file_path = os.path.join(os.getcwd(), dir_name, file_name)
    # Header row; misspellings are preserved since consumers may key on them.
    first_line = np.array(['features_name', 'str_feature',
                           'num_values',
                           'average|most_presentS',
                           'postitive(average|most_present)',
                           'negitive(average|most_present)',
                           'num_positive',
                           'num_negitive', 'feature_value_info'])
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    # Truncate / create the output file and emit the header.
    with open(file_path, "w", newline='') as csv_file:
        spamwriter = csv.writer(csv_file)
        spamwriter.writerow(first_line)
    from solve_data import feature_value_class, FeatureInData
    from collections import OrderedDict
    for fea_pos in range(1, len(features)):
        feature_info = list()
        feature_info.append(features[fea_pos])
        fea_val_cla = feature_value_class(data, fea_pos, label,
                                          fixed_str_features_index=indexs)
        feature_info.append(fea_val_cla["str_feature"])
        feature_info.append(fea_val_cla["num_of_value"])
        if fea_val_cla["str_feature"]:
            # String-valued feature: most frequent value stands in for a mean.
            feature_info.append(fea_val_cla["most_presentS"])
            try:
                l = label[0, 0]  # probe for label availability
                feature_info.append(fea_val_cla["most_presentS_positive"])
                feature_info.append(fea_val_cla["most_presentS_negitive"])
            except:
                feature_info.append("None")
                feature_info.append("None")
        else:
            # Numeric feature: report averages.
            feature_info.append(fea_val_cla["average"])
            try:
                l = label[0, 0]
                feature_info.append(fea_val_cla["average_positive"])
                feature_info.append(fea_val_cla["average_negitive"])
            except:
                feature_info.append("None")
                feature_info.append("None")
        try:
            l = label[0, 0]
            feature_info.append(fea_val_cla["num_positive"])
            feature_info.append(fea_val_cla["num_negitive"])
        except:
            feature_info.append("None")
            feature_info.append("None")
        # Append one OrderedDict per distinct value with its counts.
        for k, v in fea_val_cla.items():
            if isinstance(v, FeatureInData):
                value_info = OrderedDict()
                value_info["value"] = k
                value_info["present_num"] = v._present_num
                try:
                    l = label[0, 0]
                    value_info["respond_positive_num"] = v._respond_positive_num
                    value_info["respond_negitive_num"] = v._respond_negitive_num
                except:
                    pass
                feature_info.append(value_info)
        # One append-mode reopen per feature row.
        with open(file_path, "a+", newline='') as csv_file:
            spamwriter = csv.writer(csv_file)
            spamwriter.writerow(feature_info)
def new_UserInfo_22_23_combine2(data, features):
    """Combine UserInfo_22 and UserInfo_23 into one coded feature.

    Pairs present in the hand-built tables map to codes 0-5.  Pairs not
    in any table are grouped and coded by group size (smaller groups ->
    smaller codes), with the fully-missing pair ("-1", "-1") pinned to 5.
    The two source columns are deleted.  Returns (new_data, new_features).

    BUG FIX: a missing `continue` previously let the group-size ladder
    overwrite the code 5 assigned to the "-1_-1" fallback case.
    """
    key_features = ["UserInfo_22", "UserInfo_23"]
    print("combine2")
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "UserInfo_combine2_by_present_22_23"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    ##### map rules #####
    map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'],
                   ['未婚', 'AJ'], ['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'],
                   ['未婚', 'AP'], ['再婚', 'G'], ['未婚', 'R'], ['已婚', 'AL'],
                   ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'],
                   ['未婚', 'AD'], ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'],
                   ['-1', 'R'], ['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'],
                   ['离婚', 'P'], ['已婚', 'Z'], ['初婚', 'G'], ['-1', 'K'],
                   ['再婚', 'O'], ['-1', 'AI'], ['离婚', '-1'], ['已婚', '-1'],
                   ['再婚', 'H'], ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'],
                   ['离婚', 'M'], ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'],
                   ['-1', 'AH'], ['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'],
                   ['离婚', 'O'], ['已婚', 'AC'], ['-1', 'H'], ['未婚', 'AC'],
                   ['-1', 'AK']]
    map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'],
                  ['-1', '大学本科(简称“大学'], ['已婚', '专科毕业']]
    map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'],
                  ['已婚', '大学本科(简称“大学'], ['已婚', 'M'], ['-1', 'Y'],
                  ['未婚', 'P'], ['-1', 'O'], ['已婚', 'AK'], ['未婚', 'AI'],
                  ['未婚', 'M'], ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'],
                  ['已婚', 'H'], ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
    map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'],
                    ['未婚', 'AB'], ['未婚', 'G']]
    map_to_four = [['D', 'D']]
    map_to_five = [['-1', '-1']]
    none_finded_combine = OrderedDict()
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_22_23 = list(data[user, fea_indexs])
        if EI_22_23 in map_to_zero:
            feature_data[user, 0] = 0
        elif EI_22_23 in map_to_one:
            feature_data[user, 0] = 1
        elif EI_22_23 in map_to_two:
            feature_data[user, 0] = 2
        elif EI_22_23 in map_to_three:
            feature_data[user, 0] = 3
        elif EI_22_23 in map_to_four:
            feature_data[user, 0] = 4
        elif EI_22_23 in map_to_five:
            feature_data[user, 0] = 5
        else:
            # Unmapped pair: remember which rows carry it, keyed "a_b".
            EI_22_23_str = reduce(lambda x, y: x + "_" + y, EI_22_23)
            if EI_22_23_str not in none_finded_combine.keys():
                none_finded_combine[EI_22_23_str] = list()
            none_finded_combine[EI_22_23_str].append(user)
    for EI_combine, users in none_finded_combine.items():
        EI_combine = EI_combine.split("_")
        if EI_combine[0] == "-1" and EI_combine[1] == "-1":
            feature_data[users, 0] = 5
            continue  # fix: do not let the size-based ladder overwrite this
        # Unknown pairs are binned by how many rows share them.
        if len(users) < 10:
            feature_data[users, 0] = 0
        elif len(users) < 20:
            feature_data[users, 0] = 1
        elif len(users) < 100:
            feature_data[users, 0] = 2
        elif len(users) < 1000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=key_features)
    print(deleted)
    return new_data, new_features