def predict(self, X: pd.DataFrame):
    """predict"""
    clean_table(X)
    feature_engineer(X)
    # log(f"Remain time: {self.pred_time_budget - (time.time() - start_time)}")
    prediction = self.model.predict(X)
    return pd.Series(prediction)
def predict(self, X_test, time_remain):
    ## -------- Calculate sample size ----------
    '''main_table = self.tables[MAIN_TABLE_NAME]
    print(main_table.shape[0])
    print(X_test.shape[0])
    return None'''

    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    ## Clean tables
    clean_tables(Xs)
    # remove_trivial_features_in_tables(Xs)

    ## Merge tables and remove trivial features
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    ### ---------- Temporarily remove multi-categorical features from related tables ----------
    X.drop([c for c in X.columns if c.startswith("mul_")], axis=1, inplace=True)
    # print(X.columns)
    # input()
    ### ---------- End ----------
    remove_trivial_features(X)

    ## Add number frequency feature
    cat_features = []
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_features.append(col)
    X, _ = cat_value_counts(X, cat_features)

    ## Split train and test data
    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)

    ## Training process
    train_with_time_control(X_train, self.y, self.config)

    ## Testing process
    result = predict(X, self.config)

    return pd.Series(result)
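# A standalone sketch of the concat-with-keys round trip used above (and in
# most of the predict() variants below): merge train and test so cleaning and
# feature engineering see a single frame, then split back by index prefix and
# restore the original integer order. Toy data; only plain pandas behavior
# is assumed here.
import pandas as pd

def _concat_split_demo():
    train = pd.DataFrame({"a": [1, 2, 3]})
    test = pd.DataFrame({"a": [4, 5]})

    both = pd.concat([train, test], keys=['train', 'test'])
    both.index = both.index.map(lambda x: f"{x[0]}_{x[1]}")  # "train_0", ..., "test_1"

    # ...shared clean_df / feature_engineer steps would run on `both` here...

    test_part = both[both.index.str.startswith("test")]
    test_part.index = test_part.index.map(lambda x: int(x.split('_')[1]))
    return test_part.sort_index()  # rows restored to submission order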
def baseline_features_test(Xs, X_test, config, m_features, mlbs, one_hot_model):
    main_table = Xs[MAIN_TABLE_NAME]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    clean_tables(Xs)
    X = merge_table(Xs, config)
    clean_df(X)

    from feature_for_test import multi_features_for_test

    X = X[X.index.str.startswith("test")]
    feature_engineer(X, config)

    new_features = None
    if len(m_features) > 0 and int(config["time_budget"]) > 300:
        new_features = multi_features_for_test(X, m_features, mlbs, one_hot_model)
        # new_features.index = X.index
        X.drop(m_features, inplace=True, axis=1)
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        X = hstack([X, new_features]).tocsr()
        print("------------------")
        print(X.shape)
        # X = pd.concat([X, new_features], axis=1)
    elif len(m_features) > 0:
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        X.drop(m_features, inplace=True, axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
    else:
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
    return X
def predict(self, X: pd.DataFrame):
    """predict"""
    self.pred_func_start_time = time.time()
    print('time spent on loading model and test data: %.2f'
          % (self.pred_func_start_time - self.pred_start_time))
    self.training = False
    print('test data size: %d' % (X.shape[0]))

    fillna(X)
    feature_engineer(X, self)

    remaining_time = self.pred_time_budget - (time.time() - self.pred_start_time)
    prediction = self.model.predict(X,
                                    pred_time_budget=self.pred_time_budget,
                                    remaining_time=remaining_time)
    return pd.Series(prediction)
def train(self, X: pd.DataFrame, y: pd.Series):
    """train model"""
    start_time = time.time()
    clean_table(X)
    feature_engineer(X)
    log(f"Remain time: {self.train_time_budget - (time.time() - start_time)}")
    if self.task == 'ssl':
        self.model = AutoSSLClassifier(self.train_time_budget)
    elif self.task == 'pu':
        self.model = AutoPUClassifier(self.train_time_budget)
    elif self.task == 'noisy':
        self.model = AutoNoisyClassifier(self.train_time_budget)
    # note: any task outside {'ssl', 'pu', 'noisy'} falls through with
    # self.model unchanged, so fit() below would use a stale or missing model
    self.model.fit(X, y)
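# The if/elif chain above admits a table-driven variant; a sketch assuming the
# same Auto*Classifier classes and the (time_budget) constructor used in
# train() above. Unknown tasks fail loudly instead of leaving self.model unset.
TASK_MODELS = {
    'ssl': AutoSSLClassifier,
    'pu': AutoPUClassifier,
    'noisy': AutoNoisyClassifier,
}

def build_model(task, time_budget):
    try:
        return TASK_MODELS[task](time_budget)
    except KeyError:
        raise ValueError(f"unknown task: {task!r}")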
def predict(self, X_test, time_remain):
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    cat_features = []
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_features.append(col)
    X, _ = cat_value_counts(X, cat_features)

    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)

    # train three times and average the predictions
    result = None
    for i in range(3):
        train(X_train, self.y, self.config)
        tmp = predict(X, self.config)
        if result is None:  # was `result == None`, which broadcasts element-wise on arrays
            result = tmp
        else:
            result = result + tmp
    result = result / 3.0
    return pd.Series(result)
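# The train-three-times-and-average block above recurs in other predict()
# variants in this section; a reusable helper, sketched against the same
# train()/predict() signatures (assumes train() re-fits stochastically, so
# repeated rounds produce different models):
def bagged_predict(X_train, y, X_test, config, n_rounds=3):
    result = None
    for _ in range(n_rounds):
        train(X_train, y, config)
        pred = predict(X_test, config)
        result = pred if result is None else result + pred
    return result / float(n_rounds)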
def predict(self, X_test, time_remain):
    # NB: the 'time_ramain_so_far' key (sic) is spelled this way everywhere
    # Time_data_info is consumed, so the spelling is kept for compatibility.
    self.Time_data_info['time_ramain_so_far'] = time_remain

    start_feature = time.time()

    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]

    log(f"Merge train and test tables...")
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    log(f"Feature engineering...")
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    X = clean_df(X)
    X = feature_engineer(X, self.config)

    X_train = X[X.index.str.startswith("train")]
    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    y_train = self.targets

    end_feature = time.time()
    self.Time_data_info['time_for_feature_engineering'] = end_feature - start_feature
    self.Time_data_info['time_ramain_so_far'] = (
        self.Time_data_info['time_ramain_so_far']
        - self.Time_data_info['time_for_feature_engineering'])
    print("TIME info:", self.Time_data_info)

    # train model
    log(f"Training...")
    train_start = time.time()
    timetrain(X_train, y_train, self.config, self.Time_data_info)
    train_end = time.time()
    self.Time_data_info['time_ramain_so_far'] = (
        self.Time_data_info['time_ramain_so_far'] - (train_end - train_start))
    self.Time_data_info['time_for_model_train'] = train_end - train_start
    print("TIME info:", self.Time_data_info)

    # predict
    log(f"Predicting...")
    X_test = X[X.index.str.startswith("test")]
    X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
    X_test.sort_index(inplace=True)
    result = predict(X_test, self.config)

    return pd.Series(result)
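# The bookkeeping above (measure a stage, store its cost, decrement the
# remaining budget) can be factored into one helper; a sketch that keeps this
# code's key names, including the original 'time_ramain_so_far' spelling:
import time

def timed_stage(time_info, stage_key, fn, *args, **kwargs):
    start = time.time()
    out = fn(*args, **kwargs)
    elapsed = time.time() - start
    time_info[stage_key] = elapsed                 # e.g. 'time_for_model_train'
    time_info['time_ramain_so_far'] -= elapsed     # shrink the remaining budget
    return out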
def baseline_features(Xs, y, config):
    clean_tables(Xs)
    stampcol = Xs[CONSTANT.MAIN_TABLE_NAME][config["time_col"]].apply(
        lambda x: int(x.timestamp()))

    main_table = Xs[CONSTANT.MAIN_TABLE_NAME]
    main_table["label"] = y
    main_table["timestamp"] = stampcol
    main_table.sort_values("timestamp", inplace=True)

    tmp_columns = main_table.columns
    main_table = pd.DataFrame(main_table.values)
    main_table.columns = tmp_columns
    # main_table = main_table.iloc[0:40000]
    Xs[CONSTANT.MAIN_TABLE_NAME] = main_table

    y = main_table["label"]
    stampcol = main_table["timestamp"]

    X = merge_table(Xs, config)
    print(X.columns)
    X.drop(["label", "timestamp"], axis=1, inplace=True)
    clean_df(X)
    feature_engineer(X, config)

    cat_feature_map = {}
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_feature_map[col] = set(X[col])

    feature_names = X.columns
    m_features = []
    for feature in feature_names:
        if "mul_feature_" in feature:
            m_features.append(feature)

    one_hot_features = None
    one_hot_models = None
    mlbs = None
    if len(m_features) > 0 and int(config["time_budget"]) > 200000:
        one_hot_features, one_hot_models, mlbs = onehot_feature_selection_m(
            X, y, m_features, feature_num_everyiter=len(m_features))
        X.drop(m_features, inplace=True, axis=1)
        # X = pd.concat([X, one_hot_features], axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        X = hstack([X, one_hot_features]).tocsr()
    elif len(m_features) > 0:
        X.drop(m_features, inplace=True, axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
    else:
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)

    print("---------------------------------")
    print(X.shape)

    # X.drop(m_features, inplace=True, axis=1)
    # one_hot_features = None
    # one_hot_models = None
    # import random
    # X_tmp = [X]
    # y_tmp = [y]
    # for i in range(5):
    #     cols = list(X.columns)
    #     random.shuffle(cols)
    #     cols_tmp = cols[0:int(len(cols) * 0.5)]
    #     X_tmp.append(X[cols_tmp])
    #     y_tmp.append(y)
    # y = pd.concat(y_tmp, axis=0)
    # X = pd.concat(X_tmp, axis=0)

    return X, y, feature_names, cat_feature_map, stampcol, one_hot_features, one_hot_models, m_features, mlbs
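# A toy illustration of the dense-plus-one-hot concatenation done above: the
# numeric frame is converted to CSR and the (already sparse) one-hot block is
# appended column-wise. Data here is made up; the real block comes from
# onehot_feature_selection_m.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

def _sparse_concat_demo():
    dense = pd.DataFrame({"n_x": [0.1, 0.2], "n_y": [1.0, 2.0]})
    one_hot = csr_matrix(np.array([[1, 0, 0], [0, 1, 0]]))
    X = csr_matrix(dense.values)        # dense numeric -> CSR
    X = hstack([X, one_hot]).tocsr()    # column-wise concat, back to CSR
    return X                            # shape (2, 5)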
def predict(self, X_test, time_remain):
    timer = Timer()
    timer.set(time_remain)
    with timer.time_limit('ProProcess'):
        # fetch information of test dataset
        self.config[TEST_DATA_LENGTH] = len(X_test)
        self.config['test_time'] = self._fectch_time_range(X_test)
        self.config[STAGE] = 'test'

        Xs = self.tables
        main_table = pd.concat([Xs[MAIN_TABLE_NAME], X_test], axis=0, copy=False)
        main_table.reset_index(drop=True, inplace=True)
        del Xs[MAIN_TABLE_NAME]
        Xs[MAIN_TABLE_NAME] = main_table

        pre_process(Xs, self.config)
        clean_tables(Xs)
        pre_feature_extract(Xs)
        pre_tables_memory_cut(Xs)

        X = merge_table(Xs, self.config)

        # clean data
        del self.tables, Xs
        gc.collect()

        self.null_count_sum(X, self.config)
        clean_df(X, fill_time=True)

        # compress data to reduce memory footprint
        X = table_memory_cut(X)

        # feature engineering
        print('overall X size', X.shape)
        X, add_feature = feature_engineer(X, self.config)

        # memory pressure peaks around 11 GB here, so compress before concat
        X = table_memory_cut(X)
        add_feature = table_memory_cut(add_feature)
        X = pd.concat([X, add_feature], axis=1, copy=False)
        del add_feature
        print(X.shape)

        # split back into train/validation part and test part
        X_train_val, y_train_val = X.iloc[:self.config[TRAIN_DATA_LENGTH]], self.train_label
        X_test = X.iloc[self.config[TRAIN_DATA_LENGTH]:]
        train_len = int(self.config[TRAIN_DATA_LENGTH] * 0.8)
        valid_len = self.config[TRAIN_DATA_LENGTH] - train_len
        self.config[TRAIN_LEN_OF_TRAIN_VAL] = train_len
        self.config[VAL_LEN_OF_TRAIN_VAL] = valid_len
        del X
        gc.collect()

        # label-dependent feature processing
        all_label_count_feature_list = cat_Lable_Cnt_Fun(
            X_train_val, y_train_val, X_test, self.config)
        all_mutlicat_feature_data_list = Mv_Label_Cnt_Func(
            X_train_val, y_train_val, X_test, self.config)

        if (all_label_count_feature_list is None) & (all_mutlicat_feature_data_list is None):
            X_train, y_train = X_train_val.iloc[:train_len], self.train_label[:train_len]
            X_val, y_val = X_train_val.iloc[train_len:], self.train_label[train_len:]
        else:
            all_feature_list = []
            if all_label_count_feature_list is not None:
                all_feature_list += all_label_count_feature_list
            if all_mutlicat_feature_data_list is not None:
                all_feature_list += all_mutlicat_feature_data_list
            add_feature_data = pd.concat(all_feature_list, axis=1, copy=False)
            add_feature_data.sort_index(inplace=True)
            del all_label_count_feature_list, all_mutlicat_feature_data_list, all_feature_list
            gc.collect()

            X_train = pd.concat(
                [X_train_val[:train_len], add_feature_data[:train_len]],
                axis=1, copy=False)
            X_val = pd.concat(
                [X_train_val[train_len:self.config[TRAIN_DATA_LENGTH]],
                 add_feature_data[train_len:self.config[TRAIN_DATA_LENGTH]]],
                axis=1, copy=False)
            y_train = self.train_label[:train_len]
            y_val = self.train_label[train_len:]
            X_test = pd.concat(
                [X_test, add_feature_data[self.config[TRAIN_DATA_LENGTH]:]],
                axis=1, copy=False)
            del X_train_val, y_train_val, add_feature_data, self.train_label
            gc.collect()

    train_columns = train(X_train, X_val, y_train, y_val, self.config, timer.remain)
    del X_train, X_val, y_train, y_val
    gc.collect()

    result = predict(X_test[train_columns], self.config)
    return pd.Series(result)
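# Timer.time_limit is defined elsewhere in this repo; the usual way such a
# context manager is built is with SIGALRM. A minimal sketch under those
# assumptions (POSIX, main thread only; not necessarily this repo's actual
# implementation):
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    pass

@contextmanager
def time_limit(seconds, name=""):
    def _handler(signum, frame):
        raise TimeoutException(f"{name} exceeded {seconds}s")
    old_handler = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(int(seconds))
    try:
        yield
    finally:
        signal.alarm(0)                            # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)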
def run_upu():
    pos_per = [1, 5, 10, 20, 40]
    # pos_per = [1, 5, 10, 20, 40, 60, 80, 100]
    datasets = ["titanic", "ethn", "krvskp", "mushroom",
                "sat", "spambase", "texture", "twonorm"]
    # datasets = ["zhihu", "luoji", "myhug", "kaiyan", "nip", "yjp", "ttgwm"]
    for dataset in datasets:
        schema = "data/" + dataset + "/schmea"  # filename spelling kept as in the data layout
        train_data_path = "data/" + dataset + "/train.data"
        train_label_path = "data/" + dataset + "/train.solution"
        test_data_path = "data/" + dataset + "/test.data"
        test_label_path = "data/" + dataset + "/test.solution"
        results = open("pr_results/" + dataset, "w")

        ######## read_df ############
        train_data = read_df(train_data_path, schema)
        train_ground_truth = read_df(train_label_path, "")
        clean_df(train_data)
        feature_engineer(train_data)

        test_data = read_df(test_data_path, schema)
        test_label = read_df(test_label_path, "")
        test_label = test_label.loc[:, "label"]
        clean_df(test_data)
        feature_engineer(test_data)
        ######## read_df ############

        test_data = test_data.values

        for per in pos_per:
            pos_index_path = "data/" + dataset + "/pos_percent" + str(per) + ".npy"
            print("positive percent " + str(per))
            results.write("positive percent " + str(per) + ":\n")
            pos_idx = np.loadtxt(pos_index_path)
            pos_idx = pos_idx.astype(int)

            get_score = list()
            for i in range(len(pos_idx)):
                cur_idx = pos_idx[i]
                cur_train_data, cur_train_label = get_cur_label(
                    train_data, train_ground_truth, cur_idx)
                cur_train_data = cur_train_data.values
                cur_train_label = cur_train_label.values
                # print(cur_train_data)
                # ==================
                # INSERT YOUR CODE HERE
                y_h, train_log = run(cur_train_data, cur_train_label, test_data, test_label)

                log_path = ("pr_log/" + dataset + "_pos_percent" + str(per)
                            + "_trial" + str(i) + ".txt")
                with open(log_path, 'w') as writer:
                    tr = '[' + ','.join([str(x) for x in train_log['train_error_list']]) + ']'
                    val = '[' + ','.join([str(x) for x in train_log['val_error_list']]) + ']'
                    writer.write(tr)
                    writer.write('\n\n\n')
                    writer.write(val)

                y_h = [a.tolist()[1] for a in y_h]
                # print(type(y_h))
                # print(y_h)
                y_h = np.array(y_h)
                # y_h = copu(cur_train_data, cur_train_label, test_data)
                # ==================
                if os.path.exists(test_label_path):
                    score = validate(y_h, test_label_path)
                    get_score.append(score)
                    log(f"score = {score}")

            avg = np.mean(np.array(get_score))
            std = np.std(np.array(get_score))  # was np.mean(np.std(...)); the outer mean of a scalar was a no-op
            results.write("avg: " + str(avg) + "\n")
            results.write("std: " + str(std) + "\n")
        results.close()  # was never closed; buffered writes could be lost
def predict(self, X_test, time_remain):
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    main_time_index = main_table[["t_01"]].sort_values("t_01")
    # catLabel_dict = process_cat_label(main_table, self.lables.loc[main_table.index])  # modified 05.30

    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    clean_tables(Xs, self.config, fill=True)
    main_table = Xs[MAIN_TABLE_NAME]

    main_cat_cols = [
        col for col in main_table.columns
        if (col.startswith("c_") or col.startswith("m_"))
        and len(main_table[col].unique()) > 1
    ]
    total_num_fea = 0
    # derive extra categorical features for the main table from the main table itself
    catFea_dict, total_num_fea = process_main_cat(main_table, main_cat_cols, total_num_fea)
    print("total_num Fea:", total_num_fea)

    catFea_dicts = []
    relation_catFea_dicts = []
    relation_time_dicts = []
    relation_catFea_dicts2 = []
    if total_num_fea < 150:  # not enough features derived from the main table yet; add more
        for relation in self.config['relations']:
            tableA = relation["table_A"]
            l_type = relation["type"].split("_")[0]
            tableB = relation["table_B"]
            r_type = relation["type"].split("_")[2]
            key = relation["key"][0]
            # and "t_01" not in Xs[tableB].columns -- rather ad hoc; revisit later
            if tableA == "main" and l_type == "many" and r_type == "one":
                '''
                temp_main_cat = main_table[main_cat_cols]
                relation_num_cols = [col for col in Xs[tableB].columns if col.startswith("n_")]
                temp_tableB_num = Xs[tableB][[key] + relation_num_cols]
                temp_tableB_num = temp_tableB_num.set_index(key)
                temp_main_cat = temp_main_cat.join(temp_tableB_num, on=key)
                # main-table categoricals with relation-table numericals
                temp_dict, total_num_fea = process_main_cat_v2(
                    temp_main_cat, main_cat_cols, key, tableB, total_num_fea)
                catFea_dicts.append(temp_dict)
                if total_num_fea > 150:
                    break
                '''
                Xs[tableB].drop_duplicates([key], inplace=True)
                relation_cat_cols = [
                    col for col in Xs[tableB].columns
                    if (col.startswith("c_") or col.startswith("m_"))
                    and len(Xs[tableB][col].unique()) > 1
                ]
                temp_tableB_cat = Xs[tableB][relation_cat_cols]
                if key in main_table and key in temp_tableB_cat:
                    temp_main_num = main_table[[key]]
                    temp_tableB_cat = temp_tableB_cat.set_index(key)
                    temp_main_num = temp_main_num.join(temp_tableB_cat, on=key)
                    # relation-table categoricals with main-table numericals
                    relation_temp_dict, total_num_fea = process_relation_cat(
                        temp_main_num, relation_cat_cols, key, tableB, total_num_fea)
                    # relation_catFea_dicts.append(relation_temp_dict)
                    relation_catFea_dicts = relation_catFea_dicts + relation_temp_dict
                    # if total_num_fea > 150: break
                    '''
                    temp_tableB_cat = Xs[tableB][relation_cat_cols]
                    relation_temp_dict2, total_num_fea = process_relation_cat_v2(
                        temp_tableB_cat, relation_cat_cols, key, tableB, total_num_fea)
                    relation_catFea_dicts2.append(relation_temp_dict2)
                    '''
                relation_time_cols = [col for col in Xs[tableB].columns if col.startswith("t_")]
                if len(relation_time_cols) > 0:
                    if key in Xs[tableB] and key in main_table and "t_01" in main_table:
                        temp_tableB_time = Xs[tableB][[key] + relation_time_cols]
                        temp_tableB_time.columns = [
                            col + "_in_" + tableB if col.startswith("t_") else col
                            for col in temp_tableB_time.columns
                        ]
                        temp_main_time = main_table[[key] + ["t_01"]]
                        temp_tableB_time = temp_tableB_time.set_index(key)
                        temp_main_time = temp_main_time.join(temp_tableB_time, on=key)
                        temp_main_time.drop(key, axis=1, inplace=True)
                        # print("time_test v1")
                        # print(temp_main_time.head())
                        temp_main_time = process_relation_time(temp_main_time)
                        relation_time_dicts.append(temp_main_time)
                    '''
                    temp_tableB = Xs[tableB].set_index(key)
                    temp_main_key = main_table[[key]]
                    temp_main_key = temp_main_key.join(temp_tableB, on=key)
                    relation_temp_dict2, total_num_fea = process_relation_cat_v2(
                        temp_main_key, relation_cat_cols, key, tableB, total_num_fea)
                    del temp_main_key
                    del temp_tableB
                    relation_catFea_dicts2.append(relation_temp_dict2)
                    if total_num_fea > 150:
                        break
                    '''
    '''
    # if len(relation_time_dicts) > 0:
    main_time_col = [col for col in main_table.columns if col.startswith("t_")]
    temp_main_time = main_table[main_time_col]
    for col in main_time_col:
        temp_main_time["n_weekday_" + col], temp_main_time["n_hour_" + col], temp_main_time["n_day_" + col] = \
            zip(*temp_main_time[col].map(trans2basicInfo))
        # temp_main_time["n_weekday_" + col] = temp_main_time[col].apply(trans2weekday)
        # temp_main_time["n_hour_" + col] = temp_main_time[col].apply(trans2hour)
        # temp_main_time["n_day_" + col] = temp_main_time[col].apply(trans2day)
        if not col.startswith("t_0"):
            temp_main_time["n_interval_" + col] = (temp_main_time[col] - temp_main_time["t_01"]).map(trans2interval)
    temp_main_time.drop(main_time_col, axis=1, inplace=True)
    relation_time_dicts.append(temp_main_time)
    print("Processing Trans to main time")
    '''

    # Xs[MAIN_TABLE_NAME] = main_table
    # clean_tables(Xs, self.config, fill=True)
    merge_table_v2(Xs, self.config)
    # clean_tables(Xs)
    X = FT_process(Xs, self.config)
    del Xs
    del self.tables
    del main_table
    # print(X.shape)

    '''
    for catLabel in catLabel_dict:
        # print(catLabel_dict[catLabel].head())
        if catLabel in X.columns:
            X = X.join(catLabel_dict[catLabel], on=catLabel)
    '''
    t1 = time.time()
    useful_catFea = [catFea_dict[catFea] for catFea in catFea_dict if catFea in X.columns]
    X = pd.concat([X] + useful_catFea, axis=1)
    print("processing process_main_cat")
    '''
    for catFea in catFea_dict:
        if catFea in X.columns:
            # print(catFea_dict[catFea].head())
            X = X.join(catFea_dict[catFea], on=catFea)
            print("processing process_main_cat")
    # print(X.head())
    '''
    del catFea_dict
    '''
    for catFea_dict2 in catFea_dicts:
        for catFea in catFea_dict2:
            if catFea in X.columns:
                # print(catFea_dict2[catFea].head())
                X = X.join(catFea_dict2[catFea], on=catFea)
                print("processing process_main_cat_v2")
    # print(X.head())
    del catFea_dicts
    '''
    '''
    for relation_catFea_dict in relation_catFea_dicts:
        for relation_catFea in relation_catFea_dict:
            # print(relation_catFea_dict[relation_catFea].head())
            if relation_catFea in X.columns:
                z = yield (relation_catFea_dict[relation_catFea])
                # X = X.join(relation_catFea_dict[relation_catFea], on=relation_catFea)
                print("processing process_relation_cat")
    # print(X.head())
    '''
    X = pd.concat([X] + relation_catFea_dicts, axis=1)
    del relation_catFea_dicts

    if len(relation_time_dicts) > 0:
        X = pd.concat([X] + relation_time_dicts, axis=1)
        print("processing process_relation_time")
        # print(X.shape)
        # print(X.head())
        del relation_time_dicts
    '''
    for relation_catFea_dict2 in relation_catFea_dicts2:
        for relation_catFea in relation_catFea_dict2:
            # print(relation_catFea_dict2[relation_catFea].head())
            if relation_catFea in X.columns:
                X = X.join(relation_catFea_dict2[relation_catFea], on=relation_catFea)
                print("processing process_relation_cat_v2")
    # print(X.head())
    del relation_catFea_dicts2
    '''
    t2 = time.time()
    print("cat join cost time: ", t2 - t1)
    # print(X.head())

    # restore m_/c_/n_ prefixes on joined columns (aggregate columns keep their names)
    X.columns = [
        "m_" + c if (".m_" in c) and ("MEAN" not in c) and ("SUM" not in c)
        and ("COUNT" not in c) and ("N_UNIQUE" not in c) and ("N_TIME" not in c)
        else c for c in X.columns
    ]
    X.columns = [
        "c_" + c if (".c_" in c) and ("MEAN" not in c) and ("SUM" not in c)
        and ("COUNT" not in c) and ("N_UNIQUE" not in c) and ("N_TIME" not in c)
        else c for c in X.columns
    ]
    X.columns = [
        "n_" + c if not c.startswith("n_") and not c.startswith("m_")
        and not c.startswith("c_") and not c.startswith("t_")
        else c for c in X.columns
    ]
    # print(X.columns)
    print("Column Number:", len(X.columns))

    clean_df(X, "no_table", self.config)
    feature_engineer(X, self.config, len(X.columns), self.lables)

    X_train = X[X.index.str.startswith("train")]
    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    # train(X_train, self.lables.loc[X_train.index], self.config)
    # train on rows ordered by time
    train(X_train.loc[main_time_index.index],
          self.lables.loc[main_time_index.index], self.config)
    del main_time_index

    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)
    result = predict(X, self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    self.Time_data_info['time_ramain_so_far'] = time_remain

    start_feature = time.time()

    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    # index = main_table.sort_values(by=self.config['time_col']).index
    # split = int(0.6 * len(index))
    # train_index, test_index = index[:split], index[split:]
    # log(f"Merge train and test tables...")
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    log(f"Feature engineering...")
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    X = clean_df(X)
    X = feature_engineer(X, self.config)

    X_train = X[X.index.str.startswith("train")]
    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    y_train = self.targets

    end_feature = time.time()
    self.Time_data_info['time_for_feature_engineering'] = end_feature - start_feature
    self.Time_data_info['time_ramain_so_far'] = (
        self.Time_data_info['time_ramain_so_far']
        - self.Time_data_info['time_for_feature_engineering'])
    # self.Time_data_info['data_cols_for_hp'] = X.shape[1]
    # self.Time_data_info['data_rows_for_hp'] = X.shape[0]
    print("TIME info:", self.Time_data_info)

    # train model
    log(f"Training...")
    train_start = time.time()
    # train(X_train.iloc[train_index], y_train.iloc[train_index], self.config)
    timetrain(X_train, y_train, self.config, self.Time_data_info)  # train with time limitation
    # timetrain(X_train.iloc[train_index], y_train.iloc[train_index], self.config, self.Time_data_info)
    train_end = time.time()
    self.Time_data_info['time_ramain_so_far'] = (
        self.Time_data_info['time_ramain_so_far'] - (train_end - train_start))
    self.Time_data_info['time_for_model_train'] = train_end - train_start
    print("TIME info:", self.Time_data_info)

    # r = predict(X_train.iloc[test_index], self.config)
    # r = timepredict(X_train.iloc[test_index], self.config)
    # print('Test auc: ', roc_auc_score(y_train.iloc[test_index], r))
    # importance = self.config["model"].feature_importance(importance_type='split')
    # feature_name = np.array(self.config["model"].feature_name())
    # feature_importance = pd.DataFrame({'feature_importance': feature_name[np.argsort(-importance)],
    #                                    'importnace': -np.sort(-importance)})
    # feature_importance.to_csv('feature_importance.csv', index=False)

    # predict
    log(f"Predicting...")
    X_test = X[X.index.str.startswith("test")]
    X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
    X_test.sort_index(inplace=True)
    result = predict(X_test, self.config)

    return pd.Series(result)
def predict(self, X_test, time_remain):
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]  # .iloc[0:4000]
    # X_test = X_test.iloc[0:4000]
    # self.y = self.y.iloc[0:4000]

    if int(self.config["time_budget"]) > 2000:
        from data_sample import data_sample
        main_table, self.y = data_sample(main_table, self.y, ratio=1)
        # main_table = Xs[MAIN_TABLE_NAME].iloc[-1000000:]
        # self.y = self.y.iloc[-1000000:]

    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    ###-------------------- cat features -----------------------###
    cat_features = []
    for col in X.columns:
        if "ROLLING" not in col and "c_" in col:
            cat_features.append(col)
    X, _ = cat_value_counts(X, cat_features)
    ###--------------------------------------------------------###

    ###------------------- data sampling ------------------###
    if int(self.config["time_budget"]) <= 300:
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        from data_sample import data_sample
        X_train, self.y = data_sample(X_train, self.y, flag=True)
        X = pd.concat([X_train, X_test], keys=['train', 'test'])
    elif int(self.config["time_budget"]) < 2000:
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        from data_sample import data_sample
        X_train, self.y = data_sample(X_train, self.y)
        X = pd.concat([X_train, X_test], keys=['train', 'test'])
    # X.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

    ###------------------- multi-value one-hot features -----------------###
    m_features = []
    for col in X.columns:
        if ("ROLLING" not in col) and ("mul_feature_" in col):
            m_features.append(col)
    # if len(self.mlbs) > 0 or self.mlbs is not None:
    #     m_features = list(self.mlbs.keys())
    # else:
    #     m_features = []

    one_hot_features = None
    one_hot_models = None
    mlbs = None
    one_hot_features_m = None
    from feature_expansion import onehot_feature_selection_m
    if len(m_features) > 0 and int(self.config["time_budget"]) > 100:
        one_hot_features_m, one_hot_models, mlbs = onehot_feature_selection_m(
            X, self.y, m_features,
            feature_num_everyiter=len(m_features), selection=True)
        X.drop(m_features, inplace=True, axis=1)
    elif len(m_features) > 0:
        X.drop(m_features, inplace=True, axis=1)
    ###-------------------------------------------------###

    ###------------------- one-hot encoder ------------------###
    from feature_expansion import onehot_feature_selection
    one_hot_features = None
    if len(cat_features) > 0 and int(self.config["time_budget"]) > 4000:
        one_hot_features, one_hot_models, mlbs = onehot_feature_selection(
            X, self.y, cat_features,
            feature_num_everyiter=len(cat_features), selection=True)
        for cat_col in cat_features:
            if cat_col not in mlbs:
                X.drop(cat_col, inplace=True, axis=1)

    ###----------------------- concat --------------------###
    from scipy.sparse import hstack, csr_matrix
    X = csr_matrix(X)
    if one_hot_features is not None:
        X = hstack([X, one_hot_features]).tocsr()
    if one_hot_features_m is not None:
        X = hstack([X, one_hot_features_m]).tocsr()
    ###-------------------------------------------------###

    # ###------------------ drop mul_features ---------------###
    # m_features = []
    # for feature in X.columns:
    #     if "mul_feature_" in feature:
    #         m_features.append(feature)
    # X.drop(m_features, inplace=True, axis=1)
    # ###-------------------------------------------------###

    X_train = X[0:self.y.shape[0]]
    X = X[self.y.shape[0]:]

    result = None
    if 300 < int(self.config["time_budget"]) < 2000:
        # average three independently trained models
        for i in range(3):
            train(X_train, self.y, self.config)
            tmp = predict(X, self.config)
            if result is None:
                result = tmp
            else:
                result = result + tmp
        result = result / 3.0
    else:
        train(X_train, self.y, self.config)
        result = predict(X, self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    ### calculate time range (exploratory, disabled)
    '''Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    print(main_table.columns)
    input()
    min_train_time = np.min(main_table[[c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    max_train_time = np.max(main_table[[c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    min_test_time = np.min(X_test[[c for c in X_test.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    max_test_time = np.max(X_test[[c for c in X_test.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    print("minimum time in training dataset %s" % str(min_train_time))
    print("maximum time in training dataset %s" % str(max_train_time))
    print("minimum time in testing dataset %s" % str(min_test_time))
    print("maximum time in testing dataset %s" % str(max_test_time))
    return None'''

    ### test concept drift (exploratory, disabled)
    '''Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    #main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    #main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    main_table = pd.concat([main_table, self.y], axis=1)
    time_feature = [c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]
    main_table = main_table.sort_values(time_feature)
    number_test = int(main_table.shape[0] * 0.2)
    X_test = main_table.tail(number_test)
    X_test.index = range(X_test.shape[0])
    main_table = main_table.head(main_table.shape[0] - number_test)
    main_table.index = range(main_table.shape[0])
    min_train_time = np.min(main_table[time_feature])
    max_train_time = np.max(main_table[time_feature])
    min_test_time = np.min(X_test[time_feature])
    max_test_time = np.max(X_test[time_feature])
    print("minimum time in training dataset %s" % str(min_train_time))
    print("maximum time in training dataset %s" % str(max_train_time))
    print("minimum time in testing dataset %s" % str(min_test_time))
    print("maximum time in testing dataset %s" % str(max_test_time))
    y_test = X_test[X_test.columns[-1]]
    X_test = X_test[X_test.columns[0:-1]]
    y_train = main_table[main_table.columns[-1]]
    main_table = main_table[main_table.columns[0:-1]]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)
    cat_features = []
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_features.append(col)
    X, _ = cat_value_counts(X, cat_features)
    X_train = X[X.index.str.startswith("train")]
    X_test = X[X.index.str.startswith("test")]
    train(X_train, y_train, self.config)
    result = predict(X_test, self.config)
    fpr, tpr, thresholds = metrics.roc_curve(y_test.values, result, pos_label=1)
    print("test auc is %.4f" % (metrics.auc(fpr, tpr)))
    return None'''

    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    # drop near-constant (trivial) columns
    diff = X.max() - X.min()
    threshold = 1e-6
    X = X[X.columns[diff > threshold]]
    print("There are %d columns of trivial features" % (diff.shape[0] - X.shape[1]))

    '''cat_features = []
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_features.append(col)'''
    # X, _ = cat_value_counts(X, cat_features)
    # X = pd.get_dummies(X, columns=X.columns, sparse=True)
    # cumulative_shift, X = oneHotEncoding(X)
    # self.config["cumulative_shift"] = cumulative_shift
    X_train, X, one_hot_features, all_features = oneHotEncodingCSRMatrix(X)
    # cumulative_shift = X.shape[1]
    self.config["cumulative_shift"] = all_features
    y = self.y.values

    result = None
    # X_train = X[X.index.str.startswith("train")]
    # train(X_train, y, self.config)
    # X = X[X.index.str.startswith("test")]
    # X.index = X.index.map(lambda x: int(x.split('_')[1]))
    # X.sort_index(inplace=True)
    # result = predict(X, self.config)
    # result = train_fm_keras(X_train, X, y, self.config, one_hot_features)
    # input()
    result = train_fm_keras_batch(X_train, X, y, self.config, one_hot_features)
    # result = train_and_predict(X_train, y, X, self.config, one_hot_features)

    '''tf.reset_default_graph()
    from tensorflow.python.summary.writer import writer_cache
    #print(writer_cache.FileWriterCache.get('./models/eval'))
    writer_cache.FileWriterCache.clear()
    input()
    os.system("rm -r ./models/*")'''
    '''os.system("rm -r ./models/model.*")
    os.system("rm -r ./models/check*")
    os.system("rm -r ./models/graph.*")
    os.system("rm -r ./models/eval/*")'''
    return pd.Series(result)
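# oneHotEncodingCSRMatrix is this repo's own helper; the general technique
# (categoricals one-hot encoded directly into a CSR matrix, as fed to the FM
# model above) looks like this with scikit-learn, on made-up columns:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def _onehot_csr_demo():
    df = pd.DataFrame({"c_device": ["a", "b", "a"], "c_city": ["x", "x", "y"]})
    enc = OneHotEncoder(handle_unknown="ignore")
    X_sparse = enc.fit_transform(df)   # scipy CSR matrix, shape (3, 4)
    return X_sparse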
def predict(self, X_test, time_remain):
    time_1 = time.time()

    Xs = self.tables
    main_table_tmp = Xs[MAIN_TABLE_NAME]
    main_table = pd.concat([main_table_tmp, X_test], keys=['train', 'test'])
    # main_table = pd.concat([X_test], keys=['test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    del main_table_tmp
    del X_test
    gc.collect()

    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    del Xs
    gc.collect()

    ##############################################################################
    print(
        "########################################################################\n"
        "#                             select feature                           #\n"
        "########################################################################\n"
    )
    X_to_select = X[X.index.str.startswith("train")]
    big_df_memory = X_to_select.memory_usage().sum()
    big_df_len = X_to_select.shape[0]
    sample_num = int(len(self.y) / 10)
    part_X, part_y = data_sample_new(X_to_select, self.y, sample_num)
    del X_to_select
    # del y
    gc.collect()

    part_X = part_X.reset_index(drop=True)
    part_y = part_y.reset_index(drop=True)

    tmp_part_X, \
    self.two_order_cols, \
    self.two_group_cols, \
    self.mv_encs, \
    self.c_one_order_cols, \
    self.c_two_order_cols, \
    self.c_two_order_group_cnt_cols, \
    self.c_two_order_n_groupby_cat_cols, \
    self.n_minus_mean_cols, \
    max_numb_cols_to_select, \
    fe_model \
        = feature_engineer(part_X, self.config, part_y,
                           two_order_cols=self.two_order_cols,
                           two_group_cols=self.two_group_cols,
                           mv_encs=self.mv_encs,
                           c_one_order_cols=self.c_one_order_cols,
                           c_two_order_cols=self.c_two_order_cols,
                           c_two_order_group_cnt_cols=self.c_two_order_group_cnt_cols,
                           c_two_order_n_groupby_cat_cols=self.c_two_order_n_groupby_cat_cols,
                           n_minus_mean_cols=self.n_minus_mean_cols,
                           cols_selected=self.cols_selected,
                           big_df_memory=big_df_memory,
                           big_df_len=big_df_len,
                           fe_model=None)

    tmp_part_X_d, self.cols_selected = feature_selector(
        tmp_part_X, part_y, max_numb_cols_to_select=max_numb_cols_to_select)

    # print("#" * 50)
    # print(part_X.memory_usage())
    # print(tmp_part_X_d.memory_usage())
    # part_X_mem_use_b = part_X.memory_usage().sum()
    # tmp_part_X_mem_use_b = tmp_part_X.memory_usage().sum()
    # tmp_part_X_d_mem_use_b = tmp_part_X_d.memory_usage().sum()
    # print(part_X_mem_use_b)
    # print(tmp_part_X_d_mem_use_b)
    # print(tmp_part_X_d_mem_use_b / part_X_mem_use_b)
    # print(tmp_part_X_mem_use_b / part_X_mem_use_b)
    # part_X_mem_use_g = part_X.memory_usage().sum() / (1024 ** 3)
    # tmp_part_X__d_mem_use_g = tmp_part_X_d.memory_usage().sum() / (1024 ** 3)
    # print(part_X_mem_use_g)
    # print(tmp_part_X__d_mem_use_g)
    # print(tmp_part_X__d_mem_use_g / part_X_mem_use_g)
    # print("#" * 50)

    self.mv_encs = None
    del tmp_part_X
    del tmp_part_X_d
    del part_X
    del part_y
    gc.collect()

    print(
        "########################################################################\n"
        "#            after select feature use all of data to train            #\n"
        "########################################################################\n"
    )
    ##############################################################################
    X, \
    self.two_order_cols, \
    self.two_group_cols, \
    self.mv_encs, \
    self.c_one_order_cols, \
    self.c_two_order_cols, \
    self.c_two_order_group_cnt_cols, \
    self.c_two_order_n_groupby_cat_cols, \
    self.n_minus_mean_cols, \
    max_numb_cols_to_select, \
    fe_model \
        = feature_engineer(X, self.config,
                           two_order_cols=self.two_order_cols,
                           two_group_cols=self.two_group_cols,
                           mv_encs=self.mv_encs,
                           c_one_order_cols=self.c_one_order_cols,
                           c_two_order_cols=self.c_two_order_cols,
                           c_two_order_group_cnt_cols=self.c_two_order_group_cnt_cols,
                           c_two_order_n_groupby_cat_cols=self.c_two_order_n_groupby_cat_cols,
                           n_minus_mean_cols=self.n_minus_mean_cols,
                           cols_selected=self.cols_selected,
                           fe_model=fe_model)

    X = X[self.cols_selected]
    print(X.columns.tolist())
    print(self.cols_selected)

    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    gc.collect()

    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    gc.collect()

    time_2 = time.time()
    time_left_to_train = time_remain - (time_2 - time_1)
    tmp_time = time_left_to_train

    run_flag = True
    a_time = 0
    train_count = 0
    train_num = 0
    run_num = 1
    # while run_flag:
    change_flag = True
    print(tmp_time)
    while run_num > 0:
        for i in range(1):
            t_1 = time.time()
            part_X, part_y = data_sample_for_train(X_train, self.y)
            print("*" * 10)
            print(len(part_y))
            print("*" * 10)
            train(part_X, part_y, self.config)
            t_2 = time.time()
            a_time = t_2 - t_1
            time_left_to_train = time_left_to_train - a_time
            print('a_time: ', a_time, 'time_left_to_train: ', time_left_to_train)

        if tmp_time / a_time > 60:
            if change_flag:
                run_num = 25
                print("###25###")
        elif tmp_time / a_time < 5 and tmp_time > 3 * a_time:
            if change_flag:
                run_num = 2
                print("###2###")
        elif time_left_to_train <= 3 * a_time:
            run_num = 0
            print("###stop###")
        elif time_left_to_train < 50:
            run_num = 0
            print("###stop###")
        else:
            if change_flag:
                run_num = 3
                print("###3###")
        change_flag = False
        run_num = run_num - 1
        # if a_time * 5 + 30 >= time_left_to_train:
        #     run_flag = False
        # train_count = train_count + 1
        # if train_count > 25:
        #     run_flag = False
        # if train_count < 4:
        #     run_flag = True
        # if time_left_to_train / a_time < 3:
        #     run_flag = False

    # train(X_train, self.y, self.config)
    gc.collect()
    del X_train
    gc.collect()

    # X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)
    gc.collect()

    result = predict(X, self.config)
    return pd.Series(result)
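# The retraining loop above calibrates how many more rounds to run from the
# first round's measured cost; its budgeting rule, extracted as a sketch
# (same thresholds as the branch chain above, simplified control flow):
def rounds_for_budget(total_budget, time_left, round_cost):
    if round_cost <= 0:
        return 1
    if total_budget / round_cost > 60:
        return 25                      # plenty of budget: many rounds
    if total_budget / round_cost < 5 and total_budget > 3 * round_cost:
        return 2                       # tight budget: a couple of rounds
    if time_left <= 3 * round_cost or time_left < 50:
        return 0                       # stop: not enough time for another round
    return 3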