def replace_miss(data, features, label="", for_train=True):
    delete_fea_index = []
    missing_num = []
    new_data = data.copy()
    new_features = features.copy()
    if for_train:
        SAVE_DIR = "resultData"
    else:
        SAVE_DIR = "resultData/test/"
    # start from range(1, ...) because the first column is the id, which is useless here
    for fea_pos in range(1, len(features)):
        fea_val_cla = feature_value_class(data, fea_pos, label)
        if not fea_val_cla[-1]._present_num == 0:
            new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label,
                                                  delete_fea_index, missing_num)
    if for_train:
        new_data, new_features, deleted_feas = delete_features(new_data, new_features,
                                                               delete_fea_pos=delete_fea_index)
        save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)
        save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv",
                    np.array(deleted_feas), dir_name=SAVE_DIR)
    return new_data, new_features
def use_PCA_to_delete(data, features, needed_delete_featuers):
    from sklearn import decomposition
    stored_features = dict()
    for fea in needed_delete_featuers:
        stored = list()
        print("now:", fea)
        fea_index = find_featuers_index(fea, features)
        print("found: ", fea_index)
        delete_features_data = data[:, fea_index]
        pca = decomposition.PCA()
        pca.fit(delete_features_data)
        result = pca.explained_variance_
        print(result)
        mean = np.mean(result)
        print("mean:", mean)
        # keep only the features whose explained variance is at least the group mean
        stored = [features[fea_index[i]] for i in range(len(result))
                  if result[i] >= mean]
        #print(stored)
        save_result(stored, "after_deleted_by_pca.csv", style="a+")
        stored_features[fea] = stored
    print(stored_features)
    return stored_features
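# A minimal, self-contained sketch of the screening idea used above (data and
# thresholds here are illustrative, not the repo's API): fit a full PCA on one
# group of columns and keep the directions whose explained variance is at
# least the group mean.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
base = rng.normal(size=(200, 3))
# build a 6-column group where three columns are noisy copies of the others
group = np.hstack([base, base + 0.05 * rng.normal(size=(200, 3))])

pca = PCA()
pca.fit(group)
variances = pca.explained_variance_
kept = [i for i, v in enumerate(variances) if v >= variances.mean()]
print("explained variance:", np.round(variances, 3))
print("components kept:", kept)  # typically the three informative directions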
def replace_miss(data, features, label="", for_train=True, is_round_two=False):
    delete_fea_index = []
    missing_num = []
    new_data = data.copy()
    new_features = features.copy()
    if for_train:
        if is_round_two:
            SAVE_DIR = "resultData_two"
        else:
            SAVE_DIR = "resultData"
    else:
        if is_round_two:
            SAVE_DIR = "resultData_two/test/"
        else:
            SAVE_DIR = "resultData/test/"
    # a feature is dropped when more than two thirds of its values are missing
    threshold = int(data.shape[0] * 2 / 3)
    print("threshold: ", threshold)
    # start from range(1, ...) because the first column is the id, which is useless here
    for fea_pos in range(1, len(features)):
        fea_val_cla = feature_value_class(data, fea_pos, label)
        if not fea_val_cla[-1]._present_num == 0:
            new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label,
                                                  delete_fea_index, missing_num, threshold)
    if for_train and not is_round_two:
        new_data, new_features, deleted_feas = delete_features(new_data, new_features,
                                                               delete_fea_pos=delete_fea_index)
        print("delete while training: ", deleted_feas)
        #save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)
        save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv",
                    np.array(deleted_feas), dir_name=SAVE_DIR)
    return new_data, new_features
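# A self-contained sketch of the two-thirds missing rule above (the sentinel
# value and column layout are assumptions for illustration): count missing
# entries per column and drop any column that crosses the threshold.
import numpy as np

X = np.array([[1, -1, 3],
              [2, -1, 4],
              [3, -1, 5],
              [4,  7, -1]], dtype=float)   # -1 marks a missing value here
threshold = int(X.shape[0] * 2 / 3)        # same rule as replace_miss
missing_per_col = (X == -1).sum(axis=0)
keep = missing_per_col <= threshold
print("missing counts:", missing_per_col)  # [0 3 1]
print(X[:, keep])                           # column 1 is dropped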
import numpy as np
import matplotlib.pyplot as plt
from numpy import interp
from sklearn.metrics import roc_curve, auc

def calculate_draw_roc(classifier, data, features, label, cv_fold, original_data, original_label):
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    my_test = original_data[:3000]
    my_label = original_label[:3000]
    features_importance = dict()
    for i, (train, test) in enumerate(cv_fold):
        fitted_classifier = classifier.fit(data[train], label[train])
        probas_ = fitted_classifier.predict_proba(data[test])
        if i == 1:
            save_result(probas_, "predict_result.csv")
            save_result(label[test], "original_result.csv")
        # compute the ROC curve and the area under it for this fold
        fpr, tpr, thresholds = roc_curve(label[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        importances = fitted_classifier.feature_importances_
        indices = np.argsort(importances)[::-1]
        print("Feature ranking: ")
        for f in range(data.shape[1]):
            print("%s. %d (%f)" % (features[indices[f]], indices[f], importances[indices[f]]))
            features_importance[features[indices[f]]] = importances[indices[f]]
    # score the held-out slice with the classifier from the last fold
    test_probs = fitted_classifier.predict_proba(my_test)
    test_fpr, test_tpr, test_thresholds = roc_curve(my_label, test_probs[:, 1])
    roc_auc = auc(test_fpr, test_tpr)
    plt.plot(test_fpr, test_tpr, lw=1, label='ROC test (area = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv_fold)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig("ROC_GB_user_all_solved_lr(0.05).png")
    return features_importance
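# A compact, self-contained sketch of the per-fold ROC averaging performed
# above: interpolate each fold's TPR onto a common FPR grid with np.interp,
# then average. Data and model here are synthetic stand-ins.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=600, n_features=10, random_state=0)
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for train, test in cv.split(X, y):
    clf = GradientBoostingClassifier(random_state=0).fit(X[train], y[train])
    fpr, tpr, _ = roc_curve(y[test], clf.predict_proba(X[test])[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
mean_tpr /= cv.get_n_splits()
mean_tpr[0], mean_tpr[-1] = 0.0, 1.0
print("mean AUC: %.3f" % auc(mean_fpr, mean_tpr))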
def combine_land_modify_infos(data, log_info_data, update_info_data, saved_dir="resultData"):
    # only two columns (id, date) means there is no target column
    for_train = data.shape[1] != 2
    all_id_info = OrderedDict()
    for id_pos in range(len(data)):
        id_name = data[id_pos, 0]
        all_id_info[id_name] = OrderedDict()
        if not for_train:
            all_id_info[id_name]["target"] = None
        else:
            all_id_info[id_name]["target"] = data[id_pos, 1]
        splited_date = data[id_pos, -1].split("/")
        # if the date's style is 02/3/2014, convert it to 2014/3/02
        if int(splited_date[-1]) > int(splited_date[0]):
            t = splited_date[-1]
            splited_date[-1] = splited_date[0]
            splited_date[0] = t
            data[id_pos, -1] = splited_date[0] + "/" + splited_date[1] + "/" + splited_date[-1]
        all_id_info[id_name]["borrow_success_date"] = data[id_pos, -1]

        # add the login ("land") info
        all_id_info[id_name]["land_info"] = OrderedDict()
        land_date = list()
        land_operate_code = list()
        land_operate_style = list()
        id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
        for i in id_index_in_land:
            land_operate_code.append(log_info_data[i, 2])
            land_operate_style.append(log_info_data[i, 3])
            land_date.append(log_info_data[i, 4])
        all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
        all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
        all_id_info[id_name]["land_info"]["land_date"] = land_date

        # add the modify info
        all_id_info[id_name]["modify_info"] = OrderedDict()
        modify_info = list()
        modify_date = list()
        id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
        for i in id_index_in_modify:
            modify_info.append(update_info_data[i, 2])
            modify_date.append(update_info_data[i, 3])
        all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
        all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
    save_result(all_id_info, "all_id_info.pickle", dir_name=saved_dir)
def combine_land_modify_infos(data, log_info_data, update_info_data, for_train,
                              LU_info_file, saved_dir):
    all_id_info = OrderedDict()
    for id_pos in range(len(data)):
        id_name = data[id_pos, 0]
        all_id_info[id_name] = OrderedDict()
        if not for_train:
            all_id_info[id_name]["target"] = None
        else:
            all_id_info[id_name]["target"] = data[id_pos, 1]
        # the old in-place "/"-splitting normalization is now handled by
        # rule_the_date_style, which also covers dates separated by "-"
        data[id_pos, -1] = rule_the_date_style(data[id_pos, -1])
        all_id_info[id_name]["borrow_success_date"] = data[id_pos, -1]

        # add the login ("land") info
        all_id_info[id_name]["land_info"] = OrderedDict()
        land_date = list()
        land_operate_code = list()
        land_operate_style = list()
        id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
        for i in id_index_in_land:
            land_operate_code.append(log_info_data[i, 2])
            land_operate_style.append(log_info_data[i, 3])
            land_date.append(rule_the_date_style(log_info_data[i, 4]))
        all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
        all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
        all_id_info[id_name]["land_info"]["land_date"] = land_date

        # add the modify info
        all_id_info[id_name]["modify_info"] = OrderedDict()
        modify_info = list()
        modify_date = list()
        id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
        for i in id_index_in_modify:
            modify_info.append(update_info_data[i, 2])
            modify_date.append(rule_the_date_style(update_info_data[i, 3]))
        all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
        all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
    save_result(all_id_info, LU_info_file, dir_name=saved_dir)
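# rule_the_date_style is defined elsewhere in the repo; a minimal sketch of
# the normalization it appears to perform (based on the date logic it
# replaced): accept "/" or "-" as separator and, when the year is in the last
# position (e.g. "02/3/2014"), move it to the front ("2014/3/02").
def rule_the_date_style_sketch(raw_date):
    parts = raw_date.split("/")
    if len(parts) == 1:                  # not split by "/", try "-"
        parts = raw_date.split("-")
    if int(parts[-1]) > int(parts[0]):   # year found at the end, swap it to the front
        parts[0], parts[-1] = parts[-1], parts[0]
    return parts[0] + "/" + parts[1] + "/" + parts[2]

print(rule_the_date_style_sketch("02/3/2014"))   # 2014/3/02
print(rule_the_date_style_sketch("2014-3-02"))   # 2014/3/02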
def solve_weblog_info_package(data, features, saved_dir="resultData/"):
    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features
    data, features = new_WI_19(data, features)
    data, features = new_WI_20_by_present(data, features)
    #data, features = new_WI_20_by_positive(data, features)
    data, features = new_WI_21(data, features)
    # save_result(data, "data_after_solve_WeblogInfo_21.csv", features, dir_name=saved_dir)
    save_result(data, "data_after_solved_weblog.csv", features, dir_name=saved_dir)
    return data, features
def pipeline_for_features_solved(for_train=True, is_round_two=False):
    if not for_train and not is_round_two:
        print("invalid combination: the round-one test set is not handled by this pipeline")
        return 0
    if for_train:
        print("**************** Train ************************")
        if is_round_two:
            print("******* Round Two *********")
            data_file_name = "Kesci_Master_9w_gbk_3_2.csv"
            saved_area = "resultData_two"
        else:
            data_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
            saved_area = "resultData"
    else:
        print("**************** Test ************************")
        if is_round_two:
            print("******* Round Two *********")
            data_file_name = "Kesci_Master_9w_gbk_1_test_set.csv"
            saved_area = "resultData_two/test"
        else:
            data_file_name = "PPD_Master_GBK_2_Test_Set.csv"
            saved_area = "resultData/test"
    data, features, label = load_data_for_solve(data_file_name, for_train, is_round_two)
    data, features = replace_miss(data, features, label, for_train, is_round_two)
    if not for_train or is_round_two:
        print("all deleted: ")
        deleted_features_in_train = load_all_deleted_features_during_train(
            is_round_two=False, deleted_features_file_label="deleted_")
        #print(deleted_features_in_train)
        data, features, deleted = delete_features(data, features,
                                                  delete_feas_list=deleted_features_in_train)
        print(deleted)
    data, features = solve_user_info_package(data, features, saved_dir=saved_area)
    #save_result(data, "after_solve_user_info.csv", features, dir_name=saved_area)
    data, features = solve_weblog_info_package(data, features, saved_dir=saved_area)
    if for_train and not is_round_two:
        data, features = deleted_web_log_features(data, features, saved_dir=saved_area)
    data, features = solve_thirdparty_info_package(data, features, saved_dir=saved_area)
    data, features = extract_log_update_package(data, features, for_train, is_round_two)
    save_result(data, "data_after_features_processed.csv", features, dir_name=saved_area)
    print("****** all finished *********")
    print("size: (data, features)")
    print(data.shape)
    return data, features
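# Hypothetical usage of the pipeline above, assuming the repo modules are
# importable and the raw competition CSVs sit in the directories hard-coded
# inside the function:
train_data, train_features = pipeline_for_features_solved(for_train=True, is_round_two=True)
test_data, test_features = pipeline_for_features_solved(for_train=False, is_round_two=True)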
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    data, features = sta_start_missing_period(data, features)
    data, features = remove_thirdparty6(data, features)
    data, features = fill_thirdParty_miss(data, features)
    data, features = third_party_stable(data, features)
    data, features = third_party_level(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features, dir_name=saved_dir)
    return data, features
def correlation_between_properties(data, features):
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    title = ["features1", "features2", "calculate_method", "cor", "pval"]
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = [features[fea_pos], features[fea_pos_add]]
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            if fea_pos not in indexs and fea_pos_add not in indexs:
                # both features are numerical
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:
                # at least one of them is a str style feature
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result, "pearsonr_spearmanr_results.csv", style="a+")
            if abs(cor) >= 0.86:
                save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv",
                            style="a+")
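# A self-contained sketch of the pair test above: Pearson for two numerical
# columns, Spearman when an encoded, category-like column is involved,
# flagging |cor| >= 0.86 as "strong" (the same cutoff used above).
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
x = rng.normal(size=300)
y = 0.9 * x + 0.1 * rng.normal(size=300)   # strongly correlated with x
z = rng.randint(0, 5, size=300)            # an encoded, category-like column

cor, pval = stats.pearsonr(x, y)
print("pearsonr(x, y) = %.3f (p=%.3g)" % (cor, pval))
cor, pval = stats.spearmanr(x, z)
print("spearmanr(x, z) = %.3f (p=%.3g)" % (cor, pval))
if abs(stats.pearsonr(x, y)[0]) >= 0.86:
    print("x and y would land in the strong-correlation file")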
def use_RandomForestRegressor_to_delete(data, features, label):
    # sklearn.cross_validation is long gone; model_selection is the modern home
    from sklearn.model_selection import cross_val_score, ShuffleSplit
    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=50, max_depth=4)
    scores = []
    deleted_features = list()
    # score every feature on its own; column 0 is the id
    for i in range(1, data.shape[1]):
        score = cross_val_score(rf, data[:, i:i + 1], label, scoring="r2",
                                cv=ShuffleSplit(n_splits=3, test_size=0.3))
        scores.append((round(np.mean(score), 3), features[i]))
        if round(np.mean(score), 3) < 0.01:
            deleted_features.append({features[i]: round(np.mean(score), 3)})
    save_result(deleted_features, "RandomForestRegressor_delete_result.csv")
    print(sorted(scores, reverse=True))
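# A runnable sketch of this per-feature screening on synthetic data: each
# column is cross-validated on its own and kept only if its r2 clears a small
# threshold (0.01, as above).
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, cross_val_score

rng = np.random.RandomState(0)
X = rng.normal(size=(300, 3))
y = 2.0 * X[:, 0] + 0.1 * rng.normal(size=300)   # only column 0 is informative

rf = RandomForestRegressor(n_estimators=50, max_depth=4, random_state=0)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
for i in range(X.shape[1]):
    r2 = cross_val_score(rf, X[:, i:i + 1], y, scoring="r2", cv=cv).mean()
    print("column %d: r2 = %.3f -> %s" % (i, r2, "keep" if r2 >= 0.01 else "drop"))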
def remove_no_discrimination(data, features, label):
    # bail out when no label array was passed in (e.g. the test pipeline)
    try:
        a = label.shape
    except AttributeError:
        return data, features
    index_entroy = sort_features_with_entroy(data, features, label)
    new_data, new_features, deleted_features, delete_fea_entroy = \
        delete_no_discrimination_features(data, features, index_entroy)
    save_result(np.array(delete_fea_entroy),
                "deleted_features_with_no_discrimination(entroy).csv",
                np.array(deleted_features))
    # save_result(new_data, "data_after_delete_no_discrimination_features.csv", new_features)
    # save_features_info(new_data, new_features, label, "infos_after_delete_features.csv")
    #write_to_deleted_features_area(np.array(deleted_features))
    return new_data, new_features
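# sort_features_with_entroy lives elsewhere in the repo; as an illustration of
# the underlying idea, this hypothetical helper scores a discrete feature by
# the information gain it gives about a binary label (zero gain means the
# feature has no discrimination).
import numpy as np

def entropy(y):
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(feature_col, y):
    gain = entropy(y)
    for v in np.unique(feature_col):
        mask = feature_col == v
        gain -= mask.mean() * entropy(y[mask])
    return gain

y = np.array([0, 0, 1, 1, 0, 1, 0, 1])
useless = np.array([1, 1, 1, 1, 1, 1, 1, 1])   # one constant value: gain 0
useful = np.array([0, 0, 1, 1, 0, 1, 0, 1])    # mirrors the label: gain = H(y)
print(information_gain(useless, y))  # 0.0
print(information_gain(useful, y))   # 1.0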
def load_data_for_solve(file_name, for_train=True):
    label = ""
    if for_train:
        SAVE_DIR = "resultData/"
        # for training, load the data from the train directory
        original_features, original_data, original = load_data(file_name)
        original_features, original_data, train_label = extract_target(original_features,
                                                                       original_data)
        label = train_label.copy()
        save_result(label, "train_label_original.csv", dir_name=SAVE_DIR)
    else:
        SAVE_DIR = "resultData/test/"
        # for testing or anything else, load the data from the test location
        original_features, original_data, original = load_data(file_name, data_style="Test Set")
    #print(deleted_features)
    data = original_data.copy()
    features = original_features.copy()
    save_result(data, "withoutLabel_originalData.csv", features, dir_name=SAVE_DIR)
    return data, features, label
def submit(test_predict, save_dir):
    # build the two-column (Idx, score) submission file
    print(test_predict)
    test_predict = np.array([round(test_predict[i], 4) for i in range(test_predict.shape[0])])
    print(test_predict)
    contents = load_result("withoutLabel_originalData.csv", dir_name="resultData_All/test")
    features = np.array(contents[0])
    sublime_features = np.array([features[0], "score"])
    save_result(sublime_features, "sublime_data.csv", dir_name=save_dir)
    data = np.array(contents[1:])
    test_users = data[:, 0]
    test_users = test_users.reshape((test_users.size, 1))
    test_predict = test_predict.reshape((test_predict.size, 1))
    sublime_data = np.concatenate((test_users, test_predict), axis=1)
    save_result(sublime_data, "sublime_data.csv", style="a+", dir_name=save_dir)
def deleted_web_log_features(data, features, saved_dir="result"):
    from create_new_features import find_featuers_index
    features_name = "WeblogInfo"
    fea_indexs = find_featuers_index(features_name, features)
    # print(fea_indexs)
    weblog_data = data[:, fea_indexs]
    weblog_features = features[fea_indexs]
    correlation_between_properties(weblog_data, weblog_features)
    delete_result = according_properties_correlation_delete()
    #save_result(data, file_name, features, style, dir_name)
    save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv")
    weblog_delete_needed = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
                            "WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
                            "WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"]
    save_result(weblog_delete_needed, "deleted_useless_weblog.csv")
    delete_result.extend(weblog_delete_needed)
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=delete_result)
    print("Train delete(weblog) : ", deleted)
    return data, features
def load_data_for_solve(file_name, for_train=True, is_round_two=False):
    label = ""
    if for_train:
        if is_round_two:
            SAVE_DIR = "resultData_two/"
            data_dir = "PPD-Second-Round-Data/"
            data_style = "Rematch Train/"
            # for round-two training, load the data from the rematch train directory
            original_features, original_data, original = load_data(file_name, data_dir, data_style)
            original_features, original_data, train_label = extract_target(original_features,
                                                                           original_data)
            label = train_label.copy()
            save_result(label, "train_label_original_round_two.csv", dir_name=SAVE_DIR)
        else:
            SAVE_DIR = "resultData/"
            # for round-one training, load the data from the train directory
            original_features, original_data, original = load_data(file_name)
            original_features, original_data, train_label = extract_target(original_features,
                                                                           original_data)
            label = train_label.copy()
            save_result(label, "train_label_original.csv", dir_name=SAVE_DIR)
    else:
        if is_round_two:
            SAVE_DIR = "resultData_two/test"
            data_dir = "PPD-Second-Round-Data/"
            data_style = "Rematch Test/"
            # for round-two testing, load the data from the rematch test directory
            original_features, original_data, original = load_data(file_name, data_dir, data_style)
        else:
            SAVE_DIR = "resultData/test/"
            # for round-one testing, load the data from the test location
            original_features, original_data, original = load_data(file_name, data_style="Test Set")
    data = original_data.copy()
    features = original_features.copy()
    save_result(data, "withoutLabel_originalData.csv", features, dir_name=SAVE_DIR)
    return data, features, label
    data, features = new_UserInfo_23_education_level(data, features)
    # save_result(data, "data_after_solved_UserInfo23.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_24_resident_level(data, features)
    # save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name=saved_dir)
    #data, features = new_UserInfo_22_23_combine1(data, features)
    # save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_22_23_combine2(data, features)
    #save_result(data, "data_after_solved_user_info.csv", features, dir_name=saved_dir)
    return data, features


# new_UserInfo_7_num
if __name__ == '__main__':
    contents = load_result("withoutLabel_originalData.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    data, features = new_UserInfo_7_num(data, features)
    save_result(data, "test.csv", features)
    # deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label="deleted_")
    # data, features, deleted = delete_features(data, features, delete_feas_list=deleted_features_in_train)
    # data, features = solve_user_info_package(data, features)
    # from create_features_from_weblog import solve_weblog_info_package
    # data, features = solve_weblog_info_package(data, features)
def view_each_features_label(data, features, label):
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    new_label = label.reshape((label.size,))
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        if fea_pos in str_features_index:
            file_path = ("view_data_area/after_all/with_label_under_mean/"
                         + "(str" + str(fea_pos) + ")" + feature_name + ".png")
        else:
            file_path = ("view_data_area/after_all/with_label_under_mean/"
                         + str(fea_pos) + ")" + feature_name + ".png")
        features_info = feature_value_class(data, fea_pos, label, str_features_index)
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]], "complex_value_features.csv", style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result([features[fea_pos]],
                            "simple_discrete_value_features(nonestrfeatures).csv", style="a+")
        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([index for index in range(len(new_label))
                                   if new_label[index] == 1])
        negitive_index = np.array([index for index in range(len(new_label))
                                   if new_label[index] == 0])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negitive_index, y_negitive, marker='x', color='g', s=10)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            plt.title(feature_name + " value - label " + "distributed " + "in instances" +
                      "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    arrow_data = round(v._respond_positive_num / features_info["num_positive"], 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data,
                                 xy=(arrow_start_position_x, arrow_start_position_y),
                                 xytext=(arrow_end_postion_x, arrow_end_postion_y),
                                 arrowprops=dict(facecolor='blue', shrink=0.02))
                    arrow_data = round(v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data,
                                 xy=(arrow_start_position_x, arrow_start_position_y),
                                 xytext=(arrow_end_postion_x, arrow_end_postion_y),
                                 arrowprops=dict(facecolor='blue', shrink=0.02))
        else:
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            fea_oo = round(fea_std / fea_average, 4)
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" +
                      "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color='k', linestyle="--")
            plt.annotate(fea_average,
                         xy=(-4000, fea_average),
                         xytext=(-4000, fea_average),
                         arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi,
                         xy=(31000, fea_average),
                         xytext=(31000, fea_average),
                         arrowprops=dict(facecolor='blue', shrink=0.05))
        pos_rat = 0
        pos_rat_whole = 0
        if -1 in features_info.keys():
            pos_rat = features_info[-1]._respond_positive_num / features_info[-1]._present_num
            pos_rat_whole = features_info[-1]._respond_positive_num / features_info["num_positive"]
        plt.annotate(round(pos_rat_whole, 4),
                     xy=(31000, -1),
                     xytext=(31000, -1))
        plt.annotate(round(pos_rat, 4),
                     xy=(-4000, -1),
                     xytext=(-4000, -1))
        plt.ylim(min_v - 10, fea_average * 2)
        #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
def solve_user_info_package(data, features, saved_dir="resultData"):
    data, features = count_missed_create_new_feature(data, features, "UserInfo")
    # ################### solve the education info ####################
    data, features = new_EI_2(data, features)
    data, features = new_EI_4(data, features)
    data, features = new_EI_1_2_3_4(data, features)
    # save_result(data, "data_after_combine_EI1234.csv", features, dir_name=saved_dir)
    data, features = new_EI_6(data, features)
    data, features = new_EI_8(data, features)
    data, features = new_EI_5_6_7_8(data, features)
    # save_result(data, "data_after_combine_EI5678.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_miss_count(data, features)
    # save_result(data, "data_after_count_UserInfo_miss.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_2_level(data, features)
    # save_result(data, "data_after_solved_UserInfo2_level.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_7_level(data, features)
    # save_result(data, "data_after_solved_UserInfo7_level.csv", features, dir_name=saved_dir)
    key_features = ["UserInfo_2", "UserInfo_4"]
    feature_name = "UserInfo_2_4_wrong_correspond_city)"
    data, features = new_UserInfo_differ(data, features, key_features, feature_name)
    # save_result(data, "data_after_solved_UserInfo2_4.csv", features, dir_name=saved_dir)
    key_features = ["UserInfo_5", "UserInfo_6"]
    feature_name = "UserInfo_5_6_differ"
    data, features = new_UserInfo_differ(data, features, key_features, feature_name,
                                         deleted_all=False)
    # save_result(data, "data_after_solved_UserInfo5_6.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_789(data, features)
    # save_result(data, "data_after_solved_UserInfo789.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_11_12_13(data, features)
    key_features = ["UserInfo_11", "UserInfo_12", "UserInfo_13"]
    feature_name = "UserInfo_11_12_13_is_miss"
    data, features = new_UserInfo_differ(data, features, key_features, feature_name)
    # save_result(data, "data_after_solved_UserInfo11_12_13.csv", features, dir_name=saved_dir)
    key_features = ["UserInfo_14", "UserInfo_15"]
    feature_name = "UserInfo_14_15_differ"
    data, features = new_UserInfo_differ(data, features, key_features, feature_name,
                                         deleted_all=False)
    # save_result(data, "data_after_solved_UserInfo14_15.csv", features, dir_name=saved_dir)
    key_features = ["UserInfo_16", "UserInfo_17"]
    feature_name = "UserInfo_16_17_differ"
    data, features = new_UserInfo_differ(data, features, key_features, feature_name,
                                         deleted_all=False)
    # save_result(data, "data_after_solved_UserInfo16_17.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_18(data, features)
    # save_result(data, "data_after_solved_UserInfo18.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_19_20(data, features)
    # save_result(data, "data_after_solved_UserInfo19_20.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_22_marrage(data, features)
    # save_result(data, "data_after_solved_UserInfo22.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_23_education_level(data, features)
    # save_result(data, "data_after_solved_UserInfo23.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_24_resident_level(data, features)
    # save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name=saved_dir)
    #data, features = new_UserInfo_22_23_combine1(data, features)
    # save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name=saved_dir)
    data, features = new_UserInfo_22_23_combine2(data, features)
    save_result(data, "data_after_solved_user_info.csv", features, dir_name=saved_dir)
    return data, features
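# new_UserInfo_differ is defined elsewhere; the idea it implements, as used
# above, is a binary flag marking rows where two related columns disagree
# (e.g. registered city vs. current city). A hypothetical minimal version:
import numpy as np

def differ_flag_sketch(data, features, key_features):
    idx = [list(features).index(k) for k in key_features]
    # 1 where the two key columns are not equal, else 0
    return (data[:, idx[0]] != data[:, idx[1]]).astype(int)

features = np.array(["UserInfo_2", "UserInfo_4"])
data = np.array([["beijing", "beijing"],
                 ["shanghai", "suzhou"]])
print(differ_flag_sketch(data, features, ["UserInfo_2", "UserInfo_4"]))  # [0 1]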
def strStyle_features_to_digit(data, features, for_train=True, use_experience=False,
                               save_dir="resultData/"):
    city_features = ["UserInfo_2", "UserInfo_4", "UserInfo_8", "UserInfo_20"]
    privince_features = ["UserInfo_7", "UserInfo_19"]
    phone_features = ["UserInfo_9"]
    marry_features = ["UserInfo_22"]
    resident_features = ["UserInfo_24"]
    # features_with_simply_value = ["WeblogInfo_2", "WeblogInfo_5", "WeblogInfo_8",
    #                               "UserInfo_10", "UserInfo_18", "WeblogInfo_24",
    #                               "WeblogInfo_27", "WeblogInfo_30"]
    # there is no need to map the other str style features unless they contain
    # special characters --> the list below holds all such features
    contain_special_features = ["UserInfo_23", "Education_Info2", "Education_Info3",
                                "Education_Info4", "Education_Info6", "Education_Info7",
                                "WeblogInfo_19", "WeblogInfo_20", "Education_Info8",
                                "WeblogInfo_21", "ListingInfo"]
    digited_special_str_features = list()
    # digit the city features
    digit_city_data = digit_city_features(data, features, city_features,
                                          use_original_features=True)
    #save_result(digit_city_data, "digited_city_data.csv", features)
    digited_special_str_features.extend(city_features)
    # digit the province features
    digited_province_data = digit_province_features(digit_city_data, features, privince_features,
                                                    use_original_features=True)
    #save_result(digited_province_data, "digited_province_data.csv", features)
    digited_special_str_features.extend(privince_features)
    # digit the phone features
    digited_phone_data = digit_phone_features(digited_province_data, features, phone_features,
                                              use_original_features=True)
    #save_result(digited_phone_data, "digited_phone_data.csv", features)
    digited_special_str_features.extend(phone_features)
    # digit the marriage features
    digited_marrage_data = digit_marry_features(digited_phone_data, features, marry_features,
                                                use_original_features=True)
    #save_result(digited_marrage_data, "digited_marrage_data.csv", features)
    digited_special_str_features.extend(marry_features)
    # digit the residence features
    digited_residence_data = digit_resident_features(digited_marrage_data, features,
                                                     resident_features,
                                                     use_original_features=True)
    save_result(digited_residence_data, "data_when_digited_residence.csv", features)
    digited_special_str_features.extend(resident_features)
    digited_special_features_data = digited_residence_data
    if not for_train:
        use_experience = True
        save_dir = "resultData/test"
    # if this mapping is for training, no experience map exists yet, so build one
    if not use_experience:
        digited_data, features_map_info = map_str_to_digit(digited_special_features_data,
                                                           features,
                                                           digited_special_str_features,
                                                           contain_special_features)
        save_result(features_map_info,
                    FEATURES_MAP_INFO_FILE_NAME,
                    dir_name="resultData/features_map")
    else:
        digited_data = map_str_to_digit_with_experience(digited_special_features_data,
                                                        features,
                                                        digited_special_str_features,
                                                        contain_special_features)
    #digited_data = convert_to_numerical(convert_to_digit)
    save_result(digited_data, "data_after_Str_features_digited.csv", features, dir_name=save_dir)
    # return the fully digitized data (the original returned the pre-mapping
    # digited_special_features_data, which looks like a bug)
    return digited_data
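# A self-contained sketch of the train/test "experience" mapping idea above:
# build a value->integer map on the training column, persist it, and reuse it
# for test data so both sides share one encoding (unseen values get -1 here;
# the repo's exact fallback may differ).
def build_str_map(train_col):
    return {v: i for i, v in enumerate(sorted(set(train_col)))}

def apply_str_map(col, str_map):
    return [str_map.get(v, -1) for v in col]

train_col = ["married", "single", "married", "divorced"]
test_col = ["single", "widowed"]                # "widowed" never seen in train
experience = build_str_map(train_col)
print(apply_str_map(train_col, experience))     # [1, 2, 1, 0]
print(apply_str_map(test_col, experience))      # [2, -1]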
# @Author : chensijia ([email protected])
# @Version : 0.0.0
# @Style : Python3.5
#
# @Description:

from main_for_process_data import load_data_for_solve, replace_miss, strStyle_features_to_digit
from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train
from solve_data import delete_features

import numpy as np

if __name__ == '__main__':
    data, features, label = load_data_for_solve("PPD_Master_GBK_2_Test_Set.csv", for_train=False)
    data, features = replace_miss(data, features, label, for_train=False)
    #save_result(data, "test/data_after_filling_missing_.csv", features)

    deleted_features_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_features_with_too_many_missing")
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=deleted_features_in_train)
    save_result(data, "test_data_after_deleted_features.csv", features,
                dir_name="resultData/test/")

    data = strStyle_features_to_digit(data, features, for_train=False, use_experience=True)
    save_result(data, "data_after_digited.csv", features, dir_name="resultData/test/")
    save_features_info(data, features, label, "info_after_digit_all_features.csv",
                       dir_name="resultData/test/")
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=deleted_features_in_train)
    print(deleted)
    data, features = solve_user_info_package(data, features, saved_dir=saved_area)
    data, features = solve_weblog_info_package(data, features, saved_dir=saved_area)
    data, features = solve_thirdparty_info_package(data, features, saved_dir=saved_area)
    data, features = extract_log_update_package(data, features, for_train)
    return data, features


if __name__ == '__main__':
    # "resultData/test"
    data, features = pipeline_for_features_solved(for_train=False,
                                                  saved_area="resultData/test/")
    print(data.shape)
    save_result(data, "data_after_features_processed.csv", features,
                dir_name="resultData/test")
contents = load_result("data_after_delete_no_discrimination_features.csv") features = np.array(contents[0]) data = np.array(contents[1:]) from map_features_to_digit import convert_to_numerical from solve_data import delete_features data = convert_to_numerical(data, features) data, features, deleted = delete_features( data, features, delete_feas_list=["Idx", "ListingInfo"]) correlation_between_properties(data, features) delete_result = according_properties_correlation_delete() save_result(delete_result, "deleted_features_with_strong_correlation.csv") data, features, deleted_features = delete_features(data, features, \ delete_feas_list = delete_result) # print(deleted_features) save_result(data, "data_after_delete_strong_correlation_features.csv", features) print(data.shape) ###############3 used pca to delete ##################### # features_style = ["UserInfo", "WeblogInfo", "ThirdParty_Info_Period1", \ # "ThirdParty_Info_Period2", "ThirdParty_Info_Period3", \ # "ThirdParty_Info_Period4", "ThirdParty_Info_Period5", \ # "ThirdParty_Info_Period6"]
contents = load_result("data_after_delete_no_discrimination_features.csv") features = np.array(contents[0]) data = np.array(contents[1:]) from map_features_to_digit import convert_to_numerical from solve_data import delete_features data = convert_to_numerical(data, features) data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"]) correlation_between_properties(data, features) delete_result = according_properties_correlation_delete() save_result(delete_result, "deleted_features_with_strong_correlation.csv") data, features, deleted_features = delete_features(data, features, \ delete_feas_list = delete_result) # print(deleted_features) save_result(data, "data_after_delete_strong_correlation_features.csv", features) print(data.shape) ###############3 used pca to delete ##################### # features_style = ["UserInfo", "WeblogInfo", "ThirdParty_Info_Period1", \ # "ThirdParty_Info_Period2", "ThirdParty_Info_Period3", \ # "ThirdParty_Info_Period4", "ThirdParty_Info_Period5", \ # "ThirdParty_Info_Period6"] # #print(features)
    print(grid.best_params_)
    print(grid.best_score_)
    return grid.best_params_, grid.best_score_


if __name__ == '__main__':
    contents = load_result("all_data_after_features_processed.csv", dir_name="resultData_All")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    print("data: ", data.shape)

    label_lines = np.array(load_result("all_train_label_original.csv", dir_name="resultData_All"))
    print(label_lines.shape)
    from save_load_result import convert_to_int
    label = convert_to_int(label_lines)
    label = label.reshape((label.size,))
    print("label: ", label.shape)

    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)

    # test_preds = module_xgboost_pre(train_data, train_label, test_data)
    # calculate_draw_roc(test_label, test_preds, save_fig_name="module_xgb_ROC.png")

    auc_result, paras_result = grid_search_xgboost_params(data, label)
    print(auc_result)
    print(paras_result)
    save_result(auc_result, "grid_search_aucs.pickle", dir_name="resultData_All")
    save_result(paras_result, "grid_search_paras.pickle", dir_name="resultData_All")
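# grid_search_xgboost_params is defined elsewhere in the repo; judging from
# the best_params_/best_score_ usage above it wraps sklearn's GridSearchCV.
# A minimal sketch of that pattern (grid values are illustrative):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
param_grid = {"max_depth": [3, 5], "learning_rate": [0.05, 0.1]}
grid = GridSearchCV(XGBClassifier(n_estimators=50), param_grid,
                    scoring="roc_auc", cv=3)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)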
fea_indexs = find_featuers_index(features_name, features)
print(fea_indexs)
weblog_data = data[:, fea_indexs]
weblog_features = features[fea_indexs]
print(weblog_data.shape)
print(weblog_features.shape)
#save_result(weblog_data, "weblog_data_view.csv", weblog_features)

# label_lines = np.array(load_result("train_label_original.csv"))
# #print(label_lines.shape)
# from save_load_result import convert_to_float
# label = convert_to_float(label_lines)
# label = label.reshape((label.size,))

correlation_between_properties(weblog_data, weblog_features)
delete_result = according_properties_correlation_delete()
save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv")
weblog_data, weblog_features, deleted_features = delete_features(weblog_data, weblog_features,
                                                                 delete_feas_list=delete_result)
save_result(weblog_data, "data_after_delete_strong_correlation_weblog.csv", weblog_features)

weblog_delete_needed = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
                        "WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
                        "WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"]
save_result(weblog_delete_needed, "deleted_useless_weblog.csv")
new_data, new_features, deleted = delete_features(weblog_data, weblog_features,
                                                  delete_feas_list=weblog_delete_needed)
save_result(new_data, "data_after_delete_useless_weblog.csv", new_features)