def predict(self, X_test, time_remain):
    """Score ``X_test`` using features rebuilt over train and test together.

    The stored main table and ``X_test`` are stacked under ``train``/``test``
    keys, all tables are merged, the previously chosen feature columns are
    re-derived, and only the rows after the training block are handed to
    the module-level ``predict``.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.

    Side effects:
        Replaces the main-table entry of ``self.tables`` and finally deletes
        ``self.tables`` from the instance.
    """
    tables = self.tables
    train_rows = len(tables[MAIN_TABLE_NAME])

    stacked = pd.concat(
        [tables[MAIN_TABLE_NAME], X_test],
        keys=['train', 'test'],
        sort=True,
    )
    # Flatten the (key, original_label) pairs into "train_<i>" / "test_<i>".
    stacked.index = stacked.index.map(lambda pair: f"{pair[0]}_{pair[1]}")
    tables[MAIN_TABLE_NAME] = stacked
    clean_df(stacked)  # return value is not used

    merged = merge_table(tables, self.config)
    clean_df(merged)  # return value is not used

    wanted = [
        *self.selected_features_0,
        *self.time_feature_list,
        *self.mul_feature_list,
    ]
    features = feature_engineer_rewrite(merged.filter(wanted), self.config)

    # The test rows are taken positionally: everything after the train block.
    test_part = features.iloc[train_rows:]
    # NOTE(review): labels here appear to still be the "test_<i>" strings
    # built above, which would make this sort lexicographic - confirm the
    # resulting row order matches X_test.
    test_part.sort_index(inplace=True)

    if FEATURE_SELECTION_SWITCH:
        test_part = test_part[self.selected_features_1]

    scores = predict(test_part, self.config)
    del self.tables, X_test
    return pd.Series(scores)
def predict(self, X_test, time_remain):
    """Engineer features, train with a time limit, and predict ``X_test``.

    Train and test rows of the main table are stacked so that feature
    engineering sees both, then split again by their label prefix. Elapsed
    times for feature engineering and training are recorded in
    ``self.Time_data_info`` and subtracted from the remaining budget.

    Args:
        X_test: Test rows of the main table.
        time_remain: Time budget left when this method is entered.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.
    """
    timing = self.Time_data_info
    timing['time_ramain_so_far'] = time_remain
    feature_started = time.time()

    def rows_tagged(frame, tag):
        # Rows whose label starts with `tag`, re-labelled with the integer
        # suffix of "<tag>_<i>" and sorted by it.
        part = frame[frame.index.str.startswith(tag)]
        part.index = part.index.map(lambda label: int(label.split('_')[1]))
        part.sort_index(inplace=True)
        return part

    tables = self.tables
    train_main = tables[MAIN_TABLE_NAME]
    log("Merge train and test tables...")
    stacked = pd.concat([train_main, X_test], keys=['train', 'test'])
    stacked.index = stacked.index.map(lambda pair: f"{pair[0]}_{pair[1]}")
    tables[MAIN_TABLE_NAME] = stacked

    log("Feature engineering...")
    clean_tables(tables)
    features = feature_engineer(
        clean_df(merge_table(tables, self.config)), self.config)
    X_train = rows_tagged(features, "train")
    y_train = self.targets

    feature_seconds = time.time() - feature_started
    timing['time_for_feature_engineering'] = feature_seconds
    timing['time_ramain_so_far'] = timing['time_ramain_so_far'] - feature_seconds
    print("TIME info:", timing)

    # Model fitting, charged against the remaining budget.
    log("Training...")
    train_started = time.time()
    timetrain(X_train, y_train, self.config, timing)
    train_seconds = time.time() - train_started
    timing['time_ramain_so_far'] = timing['time_ramain_so_far'] - train_seconds
    timing['time_for_model_train'] = train_seconds
    print("TIME info:", timing)

    # Inference on the test rows, restored to integer labels.
    log("Predicting...")
    scores = predict(rows_tagged(features, "test"), self.config)
    return pd.Series(scores)
def predict(self, X_test, time_remain):
    """Rebuild features over train+test, train, and predict ``X_test``.

    Steps, in order: stack train and test main-table rows, clean and merge
    all tables, run feature engineering, drop every ``mul_``-prefixed
    column, remove trivial features, add value-count features for the
    categorical columns, split by label prefix, train, predict.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.
    """
    tables = self.tables
    stacked = pd.concat(
        [tables[MAIN_TABLE_NAME], X_test], keys=['train', 'test'])
    stacked.index = stacked.index.map(lambda pair: f"{pair[0]}_{pair[1]}")
    tables[MAIN_TABLE_NAME] = stacked

    # Clean the individual tables, then merge them into one frame.
    clean_tables(tables)
    merged = merge_table(tables, self.config)
    clean_df(merged)
    feature_engineer(merged, self.config)

    # Multi-categorical columns are dropped for now.
    multi_cat = [name for name in merged.columns if name.startswith("mul_")]
    merged.drop(multi_cat, axis=1, inplace=True)
    remove_trivial_features(merged)

    # Frequency features for plain categorical columns only.
    categorical = [
        name for name in merged.columns
        if "c_" in name and "ROLLING" not in name and "cnt" not in name
    ]
    merged, _ = cat_value_counts(merged, categorical)

    # Split back into train and test by label prefix.
    labels = merged.index.str
    train_part = merged[labels.startswith("train")]
    test_part = merged[labels.startswith("test")]
    test_part.index = test_part.index.map(
        lambda label: int(label.split('_')[1]))
    test_part.sort_index(inplace=True)

    train_with_time_control(train_part, self.y, self.config)
    return pd.Series(predict(test_part, self.config))
def predict(self, X_test, time_remain):
    """Build test features with the stored encoders and predict.

    When ``self.one_hot_features`` is set, the stored multi-value feature
    names, binarizers and one-hot models are passed to
    ``baseline_features_test``; otherwise it receives an empty list and two
    ``None`` placeholders.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.
    """
    tables = self.Xs
    from feature_for_test import baseline_features_test

    if self.one_hot_features is None:
        encoder_args = ([], None, None)
    else:
        encoder_args = (self.m_features, self.mlbs, self.one_hot_models)

    test_features = baseline_features_test(
        tables, X_test, self.config, *encoder_args)
    return pd.Series(predict(test_features, self.config))
def predict(self, X_test, time_remain):
    """Predict ``X_test`` using only the test rows as the main table.

    ``X_test`` replaces the main table in ``self.tables``; the tables are
    merged, the stored feature lists are re-derived, and the result is
    scored by the module-level ``predict``. A ``TimeManager`` checkpoint is
    recorded after each stage.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget, passed to ``TimeManager``.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.
    """
    time_manager = TimeManager(self.config, time_remain)
    print(f"prediction remaining time: {time_remain}")
    print('', flush=True)

    tables = self.tables
    tables[MAIN_TABLE_NAME] = X_test
    clean_df(X_test)
    time_manager.check("clean main table")

    merged = merge_table(tables, self.config)
    time_manager.check("merge table")

    clean_df(merged)
    time_manager.check("clean data before learning")
    print('', flush=True)

    # The first-stage selection is only prepended when selection is enabled.
    wanted = list(self.selected_features_0) if FEATURE_SELECTION_SWITCH else []
    wanted = (wanted + self.time_feature_list + self.mul_feature_list
              + self.num_feature_list)

    features = feature_engineer_rewrite(
        merged.filter(wanted), self.config, time_manager)
    time_manager.check("exit feature engineering")
    print('', flush=True)

    features.sort_index(inplace=True)
    time_manager.check("X sorting")

    if FEATURE_SELECTION_SWITCH:
        test_data_feature_selection(features, self.selected_features_1)
        features = features[self.selected_features_1]
        time_manager.check("test data feature selection")
        print('', flush=True)

    scores = predict(features, self.config)
    time_manager.check("prediction")
    print('', flush=True)
    return pd.Series(scores)
def predict(self, X_test, time_remain):
    """Train three times and return the averaged prediction for ``X_test``.

    Train and test rows of the main table are stacked so features are built
    over both, value-count features are added for categorical columns, and
    the frame is split again by label prefix. The model is then trained and
    applied three times and the three outputs are averaged.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` holding the mean of the three prediction rounds.
    """
    n_rounds = 3

    Xs = self.tables
    main_table = pd.concat(
        [Xs[MAIN_TABLE_NAME], X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table

    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    cat_features = [
        col for col in X.columns
        if "c_" in col and "ROLLING" not in col and "cnt" not in col
    ]
    X, _ = cat_value_counts(X, cat_features)

    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)

    result = None
    for _ in range(n_rounds):
        train(X_train, self.y, self.config)
        tmp = predict(X, self.config)
        # Identity test, not `== None`: once `result` holds an array,
        # `== None` compares elementwise and cannot be used as a condition.
        result = tmp if result is None else result + tmp
    result = result / float(n_rounds)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Build test features, append the selected model features, and predict.

    Baseline features are computed for ``X_test``; the stored feature
    selection models then produce extra columns (one fifth of the baseline
    column count is passed as the size argument), value-count features are
    added, and both parts are concatenated column-wise.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.
    """
    tables = self.Xs
    from feature_for_test import (
        baseline_features_test,
        cat_value_counts,
        feature_selection_test,
    )

    base = baseline_features_test(tables, X_test, self.config)
    n_selected = len(base.columns) // 5
    selected = feature_selection_test(
        base, self.feature_selection_models, n_selected)

    counted = cat_value_counts(base, self.cat_dict_counts)
    # Align labels so the column-wise concat pairs rows one to one.
    counted.index = selected.index
    combined = pd.concat([counted, selected], axis=1)

    return pd.Series(predict(combined, self.config))
def predict(self, X_test, time_remain):
    """Resume feature building over train+test, train, and predict.

    ``X_test`` is sorted by the configured time column (when present),
    converted and cleaned as the main table, appended below the stored
    training rows, and run through both feature-resume stages. The model is
    trained on the leading ``train_len`` rows and applied to the rest; the
    predictions are finally put back into the caller's original row order.

    Args:
        X_test: Test rows of the main table; re-indexed and sorted in place.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` of predictions in the original ``X_test`` row order.

    Side effects:
        Empties and deletes ``self.Xs`` and sets ``CONSTANT.TRAIN_LEN``.
    """
    timer = Timer()
    gc.collect()

    # Record the available system memory before features are resumed.
    MemoryManager.set_avl_sys_mem()

    main_name = CONSTANT.MAIN_TABLE_NAME
    time_col = self.config['time_col']

    X_test.reset_index(drop=True, inplace=True)
    if time_col in X_test.columns:
        X_test.sort_values(time_col, inplace=True)
    # The labels now hold the pre-sort positions; their argsort is the
    # permutation that restores the original order.
    restore_order = np.argsort(X_test.index)
    X_test.reset_index(drop=True, inplace=True)

    train_main = self.Xs[main_name]
    self.Xs[main_name] = X_test
    del X_test
    self.convertX(self.Xs, self.config, False)
    self.clean_data(self.Xs, isTrain=False)

    train_len = train_main.shape[0]
    self.Xs[main_name] = pd.concat(
        [train_main, self.Xs[main_name]], axis=0).reset_index(drop=True)
    del train_main
    gc.collect()

    # First-order features, built per table.
    X = self.feature_iter_Xs.feature_resume(self.Xs, self.y, isTrain=True)

    # Drop the source tables one by one, then the container itself.
    for table_name in list(self.Xs.keys()):
        del self.Xs[table_name]
    del self.Xs
    gc.collect()

    # zcm's change: drop the multi-category (MC) features.
    self.prep_class.drop_mulcat_features(X)
    gc.collect()

    CONSTANT.TRAIN_LEN = train_len
    # All remaining features, built on the merged table.
    self.feature_iter.feature_resume(X, self.y, isTrain=True)
    print(f'X mem after resume: {X.memory_usage().sum()}')

    self.prep_class.drop_features(X)
    gc.collect()
    print(f'final X mem: {X.memory_usage().sum()}')

    train(X.iloc[0:train_len], self.y, self.config, timer)
    gc.collect()

    test_features = X.iloc[train_len:].reset_index(drop=True)
    del X
    gc.collect()

    scores = predict(test_features, self.config)
    return pd.Series(scores[restore_order])
overall_time_budget = overall_time_budget + time_budget time_spent = time.time() - start vprint( verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget-time_spent)) if time_spent >= time_budget: vprint( verbose, "[-] Sorry, time budget exceeded, skipping this task") execution_success = False continue # ========= Creating a model, knowing its assigned task from D.info['task']. # The model can also select its hyper-parameters based on other elements of info. vprint( verbose, "======== Creating model ==========") import automl as mpd mpd.predict(D, output_dir, start, time_budget, basename, running_on_codalab ) ''' if zipme and overall_time_budget == 0: vprint( verbose, "========= Zipping this directory to prepare for submit ==============") data_io.zipdir(submission_filename + '.zip', ".") ''' overall_time_spent = time.time() - overall_start if execution_success: vprint( verbose, "[+] Done") vprint( verbose, "[+] Overall time spent %5.2f sec " % overall_time_spent + ":: Overall time budget %5.2f sec" % overall_time_budget) else: vprint( verbose, "[-] Done, but some tasks aborted because time limit exceeded") vprint( verbose, "[-] Overall time spent %5.2f sec " % overall_time_spent + " > Overall time budget %5.2f sec" % overall_time_budget) print "overall end", time.ctime() if running_on_codalab:
def predict(self, X_test, time_remain):
    """Preprocess train+test together, train with validation, and predict.

    The stored main table and ``X_test`` are concatenated, all tables are
    preprocessed and merged, features are engineered, and the frame is
    split back into train/validation (first 80% / last 20% of the training
    rows) and test. Optional label-count feature frames are appended when
    either helper returns any.

    Args:
        X_test: Test rows of the main table.
        time_remain: Time budget handed to the ``Timer``.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``
        on the columns returned by ``train``.

    Side effects:
        Writes several entries into ``self.config`` and deletes
        ``self.tables`` and ``self.train_label`` from the instance.
    """
    timer = Timer()
    timer.set(time_remain)

    # NOTE(review): the extent of this `with` block was reconstructed from
    # whitespace-flattened source; it is assumed to cover all preprocessing
    # so that `timer.remain` below reflects it - confirm.
    with timer.time_limit('ProProcess'):
        # Record information about the test dataset.
        self.config[TEST_DATA_LENGTH] = len(X_test)
        self.config['test_time'] = self._fectch_time_range(X_test)
        self.config[STAGE] = 'test'

        Xs = self.tables
        main_table = pd.concat(
            [Xs[MAIN_TABLE_NAME], X_test], axis=0, copy=False)
        main_table.reset_index(drop=True, inplace=True)
        del Xs[MAIN_TABLE_NAME]
        Xs[MAIN_TABLE_NAME] = main_table

        pre_process(Xs, self.config)
        clean_tables(Xs)
        pre_feature_extract(Xs)
        pre_tables_memory_cut(Xs)
        X = merge_table(Xs, self.config)

        # The source tables are no longer needed once merged.
        del self.tables, Xs
        gc.collect()

        self.null_count_sum(X, self.config)
        clean_df(X, fill_time=True)

        # Compress the data to keep memory usage down.
        X = table_memory_cut(X)

        # Feature engineering.
        print('overall X size', X.shape)
        X, add_feature = feature_engineer(X, self.config)

        # Memory issue (11G noted by the original author): compress again.
        X = table_memory_cut(X)
        add_feature = table_memory_cut(add_feature)
        X = pd.concat([X, add_feature], axis=1, copy=False)
        del add_feature
        print(X.shape)

        # Split off the test rows.
        n_train_val = self.config[TRAIN_DATA_LENGTH]
        X_train_val, y_train_val = X.iloc[:n_train_val], self.train_label
        X_test = X.iloc[n_train_val:]

        train_len = int(n_train_val * 0.8)
        valid_len = n_train_val - train_len
        self.config[TRAIN_LEN_OF_TRAIN_VAL] = train_len
        self.config[VAL_LEN_OF_TRAIN_VAL] = valid_len
        del X
        gc.collect()

        # Label-count features for categorical and multi-value columns.
        all_label_count_feature_list = cat_Lable_Cnt_Fun(
            X_train_val, y_train_val, X_test, self.config)
        all_mutlicat_feature_data_list = Mv_Label_Cnt_Func(
            X_train_val, y_train_val, X_test, self.config)

        y_train = self.train_label[:train_len]
        y_val = self.train_label[train_len:]

        if (all_label_count_feature_list is None
                and all_mutlicat_feature_data_list is None):
            X_train = X_train_val.iloc[:train_len]
            X_val = X_train_val.iloc[train_len:]
        else:
            all_feature_list = []
            if all_label_count_feature_list is not None:
                all_feature_list += all_label_count_feature_list
            if all_mutlicat_feature_data_list is not None:
                all_feature_list += all_mutlicat_feature_data_list
            add_feature_data = pd.concat(
                all_feature_list, axis=1, copy=False)
            add_feature_data.sort_index(inplace=True)
            del all_feature_list
            gc.collect()

            X_train = pd.concat(
                [X_train_val[:train_len], add_feature_data[:train_len]],
                axis=1, copy=False)
            X_val = pd.concat(
                [X_train_val[train_len:n_train_val],
                 add_feature_data[train_len:n_train_val]],
                axis=1, copy=False)
            X_test = pd.concat(
                [X_test, add_feature_data[n_train_val:]],
                axis=1, copy=False)
            # Only bound on this branch, so it is released here rather than
            # in the shared cleanup below.
            del add_feature_data

        del all_label_count_feature_list, all_mutlicat_feature_data_list
        del X_train_val, y_train_val, self.train_label
        gc.collect()

    train_columns = train(X_train, X_val, y_train, y_val, self.config,
                          timer.remain)
    del X_train, X_val, y_train, y_val
    gc.collect()

    result = predict(X_test[train_columns], self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Build features over train+test, train, and predict ``X_test``.

    The label is carried through feature engineering in a ``y_sorted``
    column: training rows hold ``self.y`` and test rows hold ``-1``, which
    is later used to split the merged frame back apart.

    Args:
        X_test: Test rows of the main table; gains a ``y_sorted`` column.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.

    Side effects:
        Adds ``y_sorted`` to the stored main table and sorts it in place,
        replaces the main-table entry of ``self.tables``, and sets
        ``self.cat_cols``, ``self.mcat_cols``, ``self.num_cols`` and
        ``self.ts_cols``.
    """

    def report_process_memory():
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

    def columns_with(frame, prefix):
        return sorted(
            name for name in frame.columns if name.startswith(prefix))

    tables = self.tables
    train_main = tables[MAIN_TABLE_NAME]
    train_main['y_sorted'] = self.y
    train_main.sort_values(self.ts_col, inplace=True)

    # Test rows are marked with the sentinel label -1.
    X_test['y_sorted'] = -1
    stacked = pd.concat([train_main, X_test], ignore_index=True)
    stacked = stacked.reset_index()
    del X_test
    gc.collect()

    tables[MAIN_TABLE_NAME] = stacked
    log('memory usage of main_table: {:.2f}MB'.format(
        df_memory_usage(stacked) // 1e6))
    report_process_memory()

    clean_tables(tables)
    X = merge_table(tables, self.config)
    clean_df(X)
    del tables, train_main, stacked
    gc.collect()

    log('memory usage of X: {:.2f}MB'.format(df_memory_usage(X) // 1e6))
    report_process_memory()

    self.cat_cols = columns_with(X, CATEGORY_PREFIX)
    self.mcat_cols = columns_with(X, MULTI_CAT_PREFIX)
    self.num_cols = columns_with(X, NUMERICAL_PREFIX)
    self.ts_cols = columns_with(X, TIME_PREFIX)

    X = self.feature_engineer(X, train=True)

    # Split on the sentinel and remove the carrier column from both parts.
    train_rows = X[X['y_sorted'] != -1]
    y_trn = train_rows['y_sorted'].copy()
    X_trn = train_rows.drop('y_sorted', axis=1)

    X_tst = X[X['y_sorted'] == -1].drop('y_sorted', axis=1)
    X_tst.sort_index(inplace=True)
    del X, train_rows
    gc.collect()

    log('memory usage of X_trn: {:.2f}MB'.format(
        df_memory_usage(X_trn) // 1e6))
    report_process_memory()

    train(X_trn, y_trn, self.config)
    del X_trn, y_trn
    gc.collect()

    log('memory usage of X_tst: {:.2f}MB'.format(
        df_memory_usage(X_tst) // 1e6))
    report_process_memory()

    scores = predict(X_tst, self.config)
    del X_tst
    gc.collect()
    return pd.Series(scores)
def predict(self, X_test, time_remain):
    """Build features over train+test, train in time order, and predict.

    Train and test rows of the main table are stacked, categorical features
    are derived from the main table, and - while fewer than 150 such
    features exist - from every ``many_to_one`` relation of ``main`` as
    well, together with time features of the related tables. All derived
    frames are concatenated column-wise onto the merged table, columns are
    re-prefixed by type, and the model is trained on the training rows
    ordered by ``t_01`` before predicting the test rows.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.

    Side effects:
        Replaces the main-table entry of ``self.tables``, drops duplicate
        keys from related tables in place, and finally deletes
        ``self.tables`` from the instance.
    """
    # Columns whose name carries one of these markers are aggregates and are
    # never re-tagged as multi-value or categorical.
    aggregate_tags = ("MEAN", "SUM", "COUNT", "N_UNIQUE", "N_TIME")

    def varying_categoricals(frame):
        # c_/m_ columns with more than one distinct value.
        return [
            col for col in frame.columns
            if col.startswith(("c_", "m_")) and len(frame[col].unique()) > 1
        ]

    def retag(columns, marker, prefix):
        # Prefix non-aggregate columns whose name contains `marker`.
        return [
            prefix + c
            if marker in c and not any(tag in c for tag in aggregate_tags)
            else c
            for c in columns
        ]

    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    # Training labels ordered by time; used to order the rows for training.
    main_time_index = main_table[["t_01"]].sort_values("t_01")

    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs, self.config, fill=True)
    main_table = Xs[MAIN_TABLE_NAME]

    # Features derived from the main table's own categorical columns.
    main_cat_cols = varying_categoricals(main_table)
    catFea_dict, total_num_fea = process_main_cat(
        main_table, main_cat_cols, 0)
    print("total_num Fea:", total_num_fea)

    relation_catFea_dicts = []
    relation_time_dicts = []
    # The main table did not yield enough derived features; add more from
    # the related tables.
    if total_num_fea < 150:
        for relation in self.config['relations']:
            tableA = relation["table_A"]
            tableB = relation["table_B"]
            type_parts = relation["type"].split("_")
            l_type, r_type = type_parts[0], type_parts[2]
            key = relation["key"][0]
            # Original author's note: this filter is fairly customised and
            # is expected to change later.
            if not (tableA == "main" and l_type == "many"
                    and r_type == "one"):
                continue

            related = Xs[tableB]
            related.drop_duplicates([key], inplace=True)

            # Categorical columns of the related table, joined onto the
            # main table's key column.
            relation_cat_cols = varying_categoricals(related)
            temp_tableB_cat = related[relation_cat_cols]
            if key in main_table and key in temp_tableB_cat:
                temp_main_num = main_table[[key]].join(
                    temp_tableB_cat.set_index(key), on=key)
                relation_temp_dict, total_num_fea = process_relation_cat(
                    temp_main_num, relation_cat_cols, key, tableB,
                    total_num_fea)
                relation_catFea_dicts = (
                    relation_catFea_dicts + relation_temp_dict)

            # Time columns of the related table, joined next to main t_01.
            relation_time_cols = [
                col for col in related.columns if col.startswith("t_")
            ]
            if (relation_time_cols and key in related
                    and key in main_table and "t_01" in main_table):
                temp_tableB_time = related[[key] + relation_time_cols]
                temp_tableB_time.columns = [
                    col + "_in_" + tableB if col.startswith("t_") else col
                    for col in temp_tableB_time.columns
                ]
                temp_tableB_time = temp_tableB_time.set_index(key)
                temp_main_time = main_table[[key, "t_01"]].join(
                    temp_tableB_time, on=key)
                temp_main_time.drop(key, axis=1, inplace=True)
                relation_time_dicts.append(
                    process_relation_time(temp_main_time))

    merge_table_v2(Xs, self.config)
    X = FT_process(Xs, self.config)
    del Xs
    del self.tables
    del main_table

    t1 = time.time()
    useful_catFea = [
        catFea_dict[catFea] for catFea in catFea_dict if catFea in X.columns
    ]
    X = pd.concat([X] + useful_catFea, axis=1)
    print("processing process_main_cat")
    del catFea_dict

    X = pd.concat([X] + relation_catFea_dicts, axis=1)
    del relation_catFea_dicts

    if relation_time_dicts:
        X = pd.concat([X] + relation_time_dicts, axis=1)
        print("processing process_relation_time")
    del relation_time_dicts
    t2 = time.time()
    print("cat join cost time: ", t2 - t1)

    # Re-prefix columns by type. The passes are sequential on purpose: each
    # one sees the names produced by the previous one.
    X.columns = retag(X.columns, ".m_", "m_")
    X.columns = retag(X.columns, ".c_", "c_")
    X.columns = [
        c if c.startswith(("n_", "m_", "c_", "t_")) else "n_" + c
        for c in X.columns
    ]
    print("Column Number:", len(X.columns))

    clean_df(X, "no_table", self.config)
    feature_engineer(X, self.config, len(X.columns), self.lables)

    X_train = X[X.index.str.startswith("train")]
    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    # Train on the rows ordered by time.
    train(X_train.loc[main_time_index.index],
          self.lables.loc[main_time_index.index], self.config)
    del main_time_index

    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)

    result = predict(X, self.config)
    return pd.Series(result)
vprint( verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget - time_spent)) if time_spent >= time_budget: vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task") execution_success = False continue # ========= Creating a model, knowing its assigned task from D.info['task']. # The model can also select its hyper-parameters based on other elements of info. vprint(verbose, "======== Creating model ==========") import automl as mpd mpd.predict(D, output_dir, start, time_budget, basename, running_on_codalab) ''' if zipme and overall_time_budget == 0: vprint( verbose, "========= Zipping this directory to prepare for submit ==============") data_io.zipdir(submission_filename + '.zip', ".") ''' overall_time_spent = time.time() - overall_start if execution_success: vprint(verbose, "[+] Done") vprint( verbose, "[+] Overall time spent %5.2f sec " % overall_time_spent + ":: Overall time budget %5.2f sec" % overall_time_budget) else: vprint(verbose, "[-] Done, but some tasks aborted because time limit exceeded") vprint(
time_spent = time.time() - start vprint( verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget-time_spent)) if time_spent >= time_budget: vprint( verbose, "[-] Sorry, time budget exceeded, skipping this task") execution_success = False continue # ========= Creating a model, knowing its assigned task from D.info['task']. # The model can also select its hyper-parameters based on other elements of info. vprint( verbose, "======== Creating model ==========") # djajetic, 2015 - original placeholder for code created by organizers is skipped (commented below) and replaced with file automl.py # code is leaned on "standard" organizer provided data loader and management import automl automl.predict(D, output_dir, basename ) ''' M = MyAutoML(D.info, verbose, debug_mode) print M # ========= Iterating over learning cycles and keeping track of time # Preferably use a method that iteratively improves the model and # regularly saves predictions results gradually getting better # until the time budget is exceeded. # The example model we provide we use just votes on an increasingly # large number of "base estimators". time_spent = time.time() - start vprint( verbose, "[+] Remaining time after building model %5.2f sec" % (time_budget-time_spent)) if time_spent >= time_budget: vprint( verbose, "[-] Sorry, time budget exceeded, skipping this task")
def predict(self, X_test, time_remain):
    """Engineer features, train under a time limit, and predict ``X_test``.

    The stored main table and ``X_test`` are stacked so feature engineering
    runs over both; the result is split again by label prefix. Durations of
    the feature and training stages are written to ``self.Time_data_info``
    and deducted from the remaining time recorded there.

    Args:
        X_test: Test rows of the main table.
        time_remain: Time budget left when this method is entered.

    Returns:
        ``pd.Series`` wrapping the output of the module-level ``predict``.
    """
    self.Time_data_info['time_ramain_so_far'] = time_remain
    tic = time.time()

    tables = self.tables
    both = pd.concat([tables[MAIN_TABLE_NAME], X_test],
                     keys=['train', 'test'])
    both.index = both.index.map(lambda pair: f"{pair[0]}_{pair[1]}")
    tables[MAIN_TABLE_NAME] = both

    log("Feature engineering...")
    clean_tables(tables)
    merged = merge_table(tables, self.config)
    merged = clean_df(merged)
    merged = feature_engineer(merged, self.config)

    is_train = merged.index.str.startswith("train")
    train_frame = merged[is_train]
    train_frame.index = train_frame.index.map(
        lambda label: int(label.split('_')[1]))
    train_frame.sort_index(inplace=True)
    targets = self.targets

    spent_on_features = time.time() - tic
    self.Time_data_info['time_for_feature_engineering'] = spent_on_features
    self.Time_data_info['time_ramain_so_far'] = (
        self.Time_data_info['time_ramain_so_far'] - spent_on_features)
    print("TIME info:", self.Time_data_info)

    # Training runs with the time information so it can limit itself.
    log("Training...")
    tic = time.time()
    timetrain(train_frame, targets, self.config, self.Time_data_info)
    spent_on_training = time.time() - tic
    self.Time_data_info['time_ramain_so_far'] = (
        self.Time_data_info['time_ramain_so_far'] - spent_on_training)
    self.Time_data_info['time_for_model_train'] = spent_on_training
    print("TIME info:", self.Time_data_info)

    # Prediction on the test rows, back on their integer labels.
    log("Predicting...")
    test_frame = merged[merged.index.str.startswith("test")]
    test_frame.index = test_frame.index.map(
        lambda label: int(label.split('_')[1]))
    test_frame.sort_index(inplace=True)

    return pd.Series(predict(test_frame, self.config))
def predict(self, X_test, time_remain):
    """Build budget-dependent features, train, and predict ``X_test``.

    The configured ``time_budget`` selects the strategy:

    * above 2000 the training rows are sampled before feature building;
      below 2000 they are sampled afterwards (with ``flag=True`` at 300 or
      less);
    * above 100 the multi-value columns are one-hot expanded before being
      dropped; above 4000 the categorical columns are expanded as well;
    * strictly between 300 and 2000 the model is trained and applied three
      times and the outputs are averaged; otherwise it is trained once.

    Args:
        X_test: Test rows of the main table.
        time_remain: Remaining time budget; not used by this implementation.

    Returns:
        ``pd.Series`` wrapping the (possibly averaged) prediction.

    Side effects:
        Replaces the main-table entry of ``self.tables`` and may replace
        ``self.y`` with the sampled labels.
    """

    def time_budget():
        return int(self.config["time_budget"])

    tables = self.tables
    train_main = tables[MAIN_TABLE_NAME]

    if time_budget() > 2000:
        from data_sample import data_sample
        train_main, self.y = data_sample(train_main, self.y, ratio=1)

    stacked = pd.concat([train_main, X_test], keys=['train', 'test'])
    stacked.index = stacked.index.map(lambda pair: f"{pair[0]}_{pair[1]}")
    tables[MAIN_TABLE_NAME] = stacked

    clean_tables(tables)
    X = merge_table(tables, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    # --- categorical value-count features ---
    cat_features = [
        name for name in X.columns
        if "ROLLING" not in name and "c_" in name
    ]
    X, _ = cat_value_counts(X, cat_features)

    # --- sampling of the training rows for the smaller budgets ---
    if time_budget() <= 300:
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        from data_sample import data_sample
        X_train, self.y = data_sample(X_train, self.y, flag=True)
        X = pd.concat([X_train, X_test], keys=['train', 'test'])
    elif time_budget() < 2000:
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        from data_sample import data_sample
        X_train, self.y = data_sample(X_train, self.y)
        X = pd.concat([X_train, X_test], keys=['train', 'test'])

    # --- one-hot expansion of multi-value columns ---
    m_features = [
        name for name in X.columns
        if "ROLLING" not in name and "mul_feature_" in name
    ]

    one_hot_features = None
    one_hot_models = None
    mlbs = None
    one_hot_features_m = None

    from feature_expansion import onehot_feature_selection_m
    if len(m_features) > 0:
        if time_budget() > 100:
            one_hot_features_m, one_hot_models, mlbs = (
                onehot_feature_selection_m(
                    X, self.y, m_features,
                    feature_num_everyiter=len(m_features),
                    selection=True))
        # The raw multi-value columns are dropped whether or not they were
        # expanded.
        X.drop(m_features, inplace=True, axis=1)

    # --- one-hot expansion of categorical columns ---
    from feature_expansion import onehot_feature_selection
    one_hot_features = None
    if len(cat_features) > 0 and time_budget() > 4000:
        one_hot_features, one_hot_models, mlbs = onehot_feature_selection(
            X, self.y, cat_features,
            feature_num_everyiter=len(cat_features),
            selection=True)
        for cat_col in cat_features:
            if cat_col not in mlbs:
                X.drop(cat_col, inplace=True, axis=1)

    # --- assemble one sparse matrix ---
    from scipy.sparse import hstack, csr_matrix
    X = csr_matrix(X)
    if one_hot_features is not None:
        X = hstack([X, one_hot_features]).tocsr()
    if one_hot_features_m is not None:
        X = hstack([X, one_hot_features_m]).tocsr()

    # Training rows come first; their count equals the label count.
    n_train = self.y.shape[0]
    X_train = X[0:n_train]
    X = X[n_train:]

    if time_budget() < 2000 and time_budget() > 300:
        result = None
        for _ in range(3):
            train(X_train, self.y, self.config)
            round_scores = predict(X, self.config)
            if result is None:
                result = round_scores
            else:
                result = result + round_scores
        result = result / 3.0
    else:
        train(X_train, self.y, self.config)
        result = predict(X, self.config)

    return pd.Series(result)