def test_input_load(feat_path, label_path):
    inputs, targets = read_feature(feat_path, label_path)

    # Pick the feature directory that matches the backbone model.
    if args.Backbone_model == 'baseLSTM' or args.Backbone_model == 'CLDNN':
        train_DB = read_DB_structure(os.path.join(c.MFB_DIR + '_' + str(args.padding_time), 'train_folder'), 'train')
        MS_path = os.path.join(c.MFB_DIR + '_' + str(args.padding_time), 'Train_Mean_Var')
    elif args.Backbone_model == '2DCRNN':
        train_DB = read_DB_structure(os.path.join(c.STFT_DIR + '_1.0', 'train_folder'), 'train')
        MS_path = os.path.join(c.STFT_DIR + '_1.0', 'Train_Mean_Var')

    # Normalize with the global mean/std computed on the training set.
    if c.USE_GLOBAL_NORM:
        mean_path = os.path.join(MS_path, 'train_mean.txt')
        std_path = os.path.join(MS_path, 'train_std.txt')
        train_mean, train_std = calc_global_mean_std(mean_path, std_path, train_DB)
        inputs = global_feature_normalize(inputs, train_mean, train_std)

    TI = LSTMInputTest()
    TT = ToTensorInput()
    inputs, targets = TI(inputs, targets)
    inputs, targets = TT(inputs, targets)

    with torch.no_grad():
        inputs = Variable(inputs)
        targets = Variable(targets)

    return inputs, targets
def Aurora_EPD(DB, feature_directory, label_directory):
    num_of_DB = len(DB)
    for i in range(num_of_DB):
        feat_path = DB['filename'][i]
        output_file = feat_path.split('/')[-1]
        output_file_path = os.path.join(feature_directory, output_file)
        label_path = DB['label_path'][i]
        output_label = label_path.split('/')[-1]
        output_label = output_label.split('.')[0] + '.pkl'
        output_label_path = os.path.join(label_directory, output_label)

        feature, label = read_feature(feat_path, label_path)

        # Getting start and end points in utterance
        start_point = np.where(label == 1)[0][0]
        end_point = np.where(label == 1)[0][-1]
        epd_feature = feature[start_point:end_point]
        epd_label = label[start_point:end_point]

        if os.path.isfile(output_file_path):
            print('\'' + output_file + '\' ' + 'feature already extracted!')
        else:
            with open(output_file_path, 'wb') as fp:
                pickle.dump(epd_feature, fp)
            print('[EPD] Feature : %s is done!' % output_file)

        if os.path.isfile(output_label_path):
            print('\'' + output_label + '\' ' + 'label already extracted!')
        else:
            with open(output_label_path, 'wb') as fp:
                pickle.dump(epd_label, fp)
            print('[EPD] Label : %s is done!' % output_label)
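# Hedged sketch (not part of the original module): illustrates the endpoint-detection
# slice used in Aurora_EPD on a toy label vector, assuming frame labels are
# 0 (silence) and 1 (speech).
def _epd_slice_example():
    import numpy as np
    label = np.array([0, 0, 1, 1, 0, 1, 0, 0])
    speech_frames = np.where(label == 1)[0]          # -> array([2, 3, 5])
    start_point, end_point = speech_frames[0], speech_frames[-1]
    # label[start_point:end_point] keeps frames 2..4; like the code above, the final
    # speech frame is excluded because the slice stop index is exclusive.
    return label[start_point:end_point]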
def get_onsets(track_id):
    """Read onset times and intervals from file onsets_dir + track_id + '.csv'.

    File should contain a time column followed by one column of inter-onset
    intervals.
    """
    onsets_file = os.path.join(onsets_dir, str(track_id) + '.csv')
    t, ioi = utils.read_feature(onsets_file, time=True)
    return t, ioi
def get_beats(track_id):
    """Read beat times and intervals from file beats_dir + track_id + '.csv'.

    File should contain a time column followed by one column of beat intervals.
    """
    beats_file = os.path.join(beats_dir, str(track_id) + '.csv')
    t, beat_intervals = utils.read_feature(beats_file, time=True)
    return t, beat_intervals
def get_chroma(track_id):
    """Read chroma data from file chroma_dir + track_id + '.csv'.

    File should contain a time column followed by one column per chroma
    dimension.
    """
    chroma_file = os.path.join(chroma_dir, track_id + '.csv')
    t, chroma = utils.read_feature(chroma_file, time=True)
    return t, chroma
def get_melody(track_id):
    """Read melody data from file melody_dir + track_id + '.csv'.

    File should contain melody data in two columns: (time, melody), with melody
    in MIDI note number (float or int). Frames in which no pitch is present can
    be set to 0, None or np.nan.
    """
    melody_file = os.path.join(melody_dir, track_id + '.csv')
    t, melody = utils.read_feature(melody_file, time=True)
    return t, melody.flatten()
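# Hedged example (an assumed layout, not taken from the source): the melody CSV read
# above is expected to hold two columns, time in seconds and pitch in MIDI note
# number, with unvoiced frames left as 0 or NaN, e.g.
#
#   0.000, 0.0
#   0.010, 65.2
#   0.020, 65.3
#   0.030, nan
#
# so get_melody('track42') would read melody_dir/track42.csv and return (t, melody)
# as two 1-D arrays of equal length.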
def compute(segment_dict, features):
    """
    Args:
        segment_dict (dict): dictionary of song segments, containing a list of
            segment ids (values) for a set of unique song identifiers (keys).
    """
    data_dict = {}

    # compute features
    for feature in features:
        print('computing ' + feature + '...')
        feature_name, first_order_aggregates, second_order_aggregates = parse_feature(feature)

        corpus_features = []
        for song_id in segment_dict.keys():
            song_features = []
            for segment in segment_dict[song_id]:
                raw_features = utils.read_feature([data_dir, feature_name, segment], skip_cols='auto')
                segment_features = first_order(raw_features, first_order_aggregates, verbose=False)
                song_features.append(segment_features)
            if 'song' in second_order_aggregates:
                song_features = second_order(song_features, second_order_aggregates, verbose=False)
            corpus_features.extend(song_features)

        if 'corpus' in second_order_aggregates:
            # print(' in: len(corpus_features) = {}, corpus_features[0] = {}'.format(len(corpus_features), corpus_features[0]))
            corpus_features = second_order(corpus_features, second_order_aggregates, verbose=False)
            # print(' out: len(corpus_features) = {}, corpus_features[0] = {}'.format(len(corpus_features), corpus_features[0]))

        data_dict[feature] = np.squeeze(corpus_features)

    # add segment ids
    song_ids = []
    segments = []
    for song_id in segment_dict.keys():
        for segment in segment_dict[song_id]:
            song_ids.append(song_id)
            segments.append(segment)
    data_dict['song.id'] = np.array(song_ids)
    data_dict['segment.id'] = np.array(segments)

    # convert to dataframe
    return pd.DataFrame(data_dict)
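# Minimal usage sketch (not part of the original module): the song/segment ids and the
# feature string below are hypothetical, and parse_feature() defines the real feature
# syntax; compute() expects one feature file per segment under data_dir/<feature_name>/.
def _compute_usage_example():
    example_segments = {
        'song_001': ['song_001_seg00', 'song_001_seg01'],
        'song_002': ['song_002_seg00'],
    }
    # 'pitch.mean' is a placeholder feature specification.
    df = compute(example_segments, ['pitch.mean'])
    # One row per segment, plus 'song.id' and 'segment.id' columns.
    return df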
def load_graph(dataset, labels_is_onehot=True):
    # Node features are required; labels are optional and may be missing.
    features = read_feature("./data/" + dataset + ".feature", is_normalize=False)
    if os.path.exists("./data/" + dataset + ".label"):
        labels = read_label("./data/" + dataset + ".label", is_onehot=labels_is_onehot)
    else:
        labels = None
    G = read_graph("./data/" + dataset + '.edgelist')
    graph = Graph(features, G, labels)
    return graph
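# Minimal usage sketch (assumption: a dataset named 'cora' with files
# ./data/cora.feature, ./data/cora.edgelist and, optionally, ./data/cora.label,
# which is the layout load_graph() expects).
def _load_graph_example():
    graph = load_graph('cora', labels_is_onehot=False)
    # graph wraps the node feature matrix, the graph built from the edge list,
    # and the labels (None when no .label file is present).
    return graph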
def main():
    logging.info('reading data')
    train_mat = read_rating('data/ml-1m/normalTrain.csv')
    test_mat = read_rating('data/ml-1m/test.csv')
    item_mat = read_feature('data/ml-1m/itemFeat.csv')
    num_item_feat = item_mat.shape[1]

    model = CollaborativeDeepLearning(item_mat, [num_item_feat, 16, 8])
    model.pretrain(lamda_w=0.001, encoder_noise=0.3, epochs=10)
    model_history = model.fineture(train_mat, test_mat, lamda_u=0.01, lamda_v=0.1, lamda_n=0.1, lr=0.01, epochs=3)
    testing_rmse = model.getRMSE(test_mat)
    print('Testing RMSE = {}'.format(testing_rmse))
def padding(DB, feature_directory, label_directory, padding_time):
    num_of_DB = len(DB)
    for i in range(num_of_DB):
        feat_path = DB['filename'][i]
        output_file = feat_path.split('/')[-1]
        output_file_path = os.path.join(feature_directory, output_file)
        label_path = DB['label_path'][i]
        output_label = label_path.split('/')[-1]
        output_label = output_label.split('.')[0] + '.pkl'
        output_label_path = os.path.join(label_directory, output_label)

        feature, label = read_feature(feat_path, label_path)
        start_seg_feat, end_seg_feat = feature[:100], feature[-100:]
        start_seg_label, end_seg_label = label[:100], label[-100:]

        # Data have already been padded with 1 second of silence at both ends of the speech.
        if padding_time == 0.0:
            final_feature = feature[100:-100]
            final_label = label[100:-100]
        elif padding_time == 2.0:
            final_feature = np.concatenate((start_seg_feat, feature, end_seg_feat))
            final_label = np.concatenate((start_seg_label, label, end_seg_label))
        elif padding_time == 3.0:
            final_feature = np.concatenate((start_seg_feat, start_seg_feat, feature, end_seg_feat, end_seg_feat))
            final_label = np.concatenate((start_seg_label, start_seg_label, label, end_seg_label, end_seg_label))
        else:
            raise ValueError('Unsupported padding_time: {}'.format(padding_time))

        if os.path.isfile(output_file_path):
            print('\'' + output_file + '\' ' + 'feature already extracted!')
        else:
            with open(output_file_path, 'wb') as fp:
                pickle.dump(final_feature, fp)
            print('[Padding] Feature : %s is done!' % output_file)

        if os.path.isfile(output_label_path):
            print('\'' + output_label + '\' ' + 'label already extracted!')
        else:
            with open(output_label_path, 'wb') as fp:
                pickle.dump(final_label, fp)
            print('[Padding] Label : %s is done!' % output_label)
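# Hedged note (assumes 10 ms frames, i.e. 100 frames per second, which is what the
# hard-coded 100-frame segments above imply): the source utterances already carry
# 1 second of silence at each end, so padding_time == 0.0 strips it entirely,
# 2.0 duplicates the first/last second once (~2 s per side), and 3.0 duplicates it
# twice (~3 s per side). A toy shape check:
def _padding_length_example(num_frames=500, padding_time=2.0):
    import numpy as np
    feature = np.zeros((num_frames, 40))      # toy feature matrix; 40-dim frames assumed
    start, end = feature[:100], feature[-100:]
    if padding_time == 0.0:
        out = feature[100:-100]
    elif padding_time == 2.0:
        out = np.concatenate((start, feature, end))
    elif padding_time == 3.0:
        out = np.concatenate((start, start, feature, end, end))
    # With num_frames=500 the three cases give (300, 40), (700, 40) and (900, 40).
    return out.shape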
dic = f.pickle_op(file_name='saved_pickle/{}.p'.format(args.input), mode='r')
y = dic['y']  # 26 x 27
x = dic['x']  # 26 x 27 x 30 x 6080 or 26 x 513 x 30 x 419
x = f.sub_baseline(x)

'''
ff.wavelet_power(x[0][0][0])
f, t, Zxx = signal.stft(x[0][0][0], 200)
print(f.shape)
print(t.shape)
'''

# ------------------------------- read feature -------------------------------
# ------------------- uncomment the feature you want -------------------
FT_psd, FT_psd_log = f.read_feature('{}_feature_pickle/{}/FT_psd.p'.format(args.input_type, args.dir))
T_mean_power, T_mean_power_log = f.read_feature('{}_feature_pickle/{}/T_mean_power.p'.format(args.input_type, args.dir))
T_mean, T_mean_log = f.read_feature('{}_feature_pickle/{}/T_mean.p'.format(args.input_type, args.dir))
T_std, T_std_log = f.read_feature('{}_feature_pickle/{}/T_std.p'.format(args.input_type, args.dir))
T_first_diff, T_first_diff_log = f.read_feature('{}_feature_pickle/{}/T_first_diff.p'.format(args.input_type, args.dir))
T_second_diff, T_second_diff_log = f.read_feature('{}_feature_pickle/{}/T_second_diff.p'.format(args.input_type, args.dir))
STFT_power, STFT_power_log = f.read_feature('{}_feature_pickle/{}/STFT_power.p'.format(args.input_type, args.dir))
def preprocessing( args ):
    gc.enable()
    time_bar = tqdm(total = 90, desc = "preprocessing")

    # Target variable
    target_name = 'TARGET'
    one_hot_encode = args.onehot_encode

    #===========================
    # Source data
    #===========================
    # application_{train|test}
    if( args.feature_format ):
        df_application_train = read_feature( os.path.join(args.dataset_dir, "application_train.feature") )
        df_application_test = read_feature( os.path.join(args.dataset_dir, "application_test.feature") )
    else:
        df_application_train = pd.read_csv( os.path.join(args.dataset_dir, "application_train.csv") )
        df_application_test = pd.read_csv( os.path.join(args.dataset_dir, "application_test.csv") )

    #df_application_train.drop(['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'], axis=1, inplace=True)
    #df_application_test.drop(['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'], axis=1, inplace=True)

    if( args.onehot_encode ):
        for col in df_application_train.columns:
            if( df_application_train[col].dtypes == "object" ):
                df_application_train[col] = df_application_train[col].fillna('NA')
                df_application_test[col] = df_application_test[col].fillna('NA')

        df_application_train = pd.get_dummies( df_application_train )
        df_application_test = pd.get_dummies( df_application_test )

        # One-hot encoding creates columns that exist in the training data but not in the test data, so align them
        train_labels = df_application_train['TARGET']

        # Align the training and testing data, keep only columns present in both dataframes
        df_application_train, df_application_test = df_application_train.align(df_application_test, join = 'inner', axis = 1)

        # Add the target back in
        df_application_train['TARGET'] = train_labels

    # Base data
    df_train = df_application_train
    df_test = df_application_test

    #----------------------------
    # Features based on domain knowledge
    #----------------------------
    if( args.domain_features ):
        # CREDIT_INCOME_PERCENT: ratio of the credit amount to the client's income
        df_train['CREDIT_INCOME_PERCENT'] = df_train['AMT_CREDIT'] / df_train['AMT_INCOME_TOTAL']
        df_test['CREDIT_INCOME_PERCENT'] = df_test['AMT_CREDIT'] / df_test['AMT_INCOME_TOTAL']

        # ANNUITY_INCOME_PERCENT: ratio of the loan annuity to the client's income
        df_train['ANNUITY_INCOME_PERCENT'] = df_train['AMT_ANNUITY'] / df_train['AMT_INCOME_TOTAL']
        df_test['ANNUITY_INCOME_PERCENT'] = df_test['AMT_ANNUITY'] / df_test['AMT_INCOME_TOTAL']

        # CREDIT_TERM: length of the payment in months
        df_train['CREDIT_TERM'] = df_train['AMT_ANNUITY'] / df_train['AMT_CREDIT']
        df_test['CREDIT_TERM'] = df_test['AMT_ANNUITY'] / df_test['AMT_CREDIT']

        # DAYS_EMPLOYED_PERCENT: ratio of days employed to the client's age
        df_train['DAYS_EMPLOYED_PERCENT'] = df_train['DAYS_EMPLOYED'] / df_train['DAYS_BIRTH']
        df_test['DAYS_EMPLOYED_PERCENT'] = df_test['DAYS_EMPLOYED'] / df_test['DAYS_BIRTH']

    time_bar.update(10)

    #===========================
    # Merge sub-tables
    #===========================
    #---------------------------
    # bureau
    #---------------------------
    if( args.feature_format ):
        df_bureau = read_feature( os.path.join(args.dataset_dir, "bureau.feature") )
    else:
        df_bureau = pd.read_csv( os.path.join(args.dataset_dir, "bureau.csv") )
    df_bureau_agg_numric = agg_dataframe_numric( df_bureau, agg_column = 'SK_ID_CURR', base_column_name = "bureau" )
    df_bureau_agg_categorical = agg_dataframe_categorical( df_bureau, agg_column = 'SK_ID_CURR', base_column_name = "bureau", one_hot_encode = one_hot_encode )

    # Merge into the base data
    df_train = pd.merge(df_train, df_bureau_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_bureau_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_bureau_agg_numric, df_bureau_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # bureau_balance
    #---------------------------
    if( args.feature_format ):
        df_bureau_balance = read_feature( os.path.join(args.dataset_dir, "bureau_balance.feature") )
    else:
        df_bureau_balance = pd.read_csv( os.path.join(args.dataset_dir, "bureau_balance.csv") )

    # Aggregate rows sharing the same SK_ID_BUREAU
    df_bureau_balance_agg_numric = agg_dataframe_numric( df_bureau_balance, agg_column = 'SK_ID_BUREAU', base_column_name = "bureau_balance" )
    df_bureau_balance_agg_categorical = agg_dataframe_categorical( df_bureau_balance, agg_column = 'SK_ID_BUREAU', base_column_name = "bureau_balance", one_hot_encode = one_hot_encode )

    # Attach the corresponding 'SK_ID_BUREAU' to the 'SK_ID_CURR' of the parent data (df_bureau)
    df_bureau_balance_agg_numric = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(df_bureau_balance_agg_numric, on = 'SK_ID_BUREAU', how = 'left')
    df_bureau_balance_agg_categorical = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(df_bureau_balance_agg_categorical, on = 'SK_ID_BUREAU', how = 'left')

    # A single `SK_ID_CURR` now maps to multiple `SK_ID_BUREAU` rows, so aggregate on `SK_ID_CURR`
    df_bureau_balance_agg_numric = agg_dataframe_numric( df_bureau_balance_agg_numric.drop(columns = ['SK_ID_BUREAU']), agg_column = 'SK_ID_CURR', base_column_name = "bureau_balance" )
    df_bureau_balance_agg_categorical = agg_dataframe_numric( df_bureau_balance_agg_categorical.drop(columns = ['SK_ID_BUREAU']), agg_column = 'SK_ID_CURR', base_column_name = "bureau_balance" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_bureau_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_bureau_balance_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_balance_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_bureau, df_bureau_balance, df_bureau_balance_agg_numric, df_bureau_balance_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # previous_application
    #---------------------------
    if( args.feature_format ):
        df_previous_application = read_feature( os.path.join(args.dataset_dir, "previous_application.feature") )
    else:
        df_previous_application = pd.read_csv( os.path.join(args.dataset_dir, "previous_application.csv") )

    df_previous_application_agg_numric = agg_dataframe_numric( df_previous_application, agg_column = 'SK_ID_CURR', base_column_name = "previous_application" )
    df_previous_application_agg_categorical = agg_dataframe_categorical( df_previous_application, agg_column = 'SK_ID_CURR', base_column_name = "previous_application", one_hot_encode = one_hot_encode )

    # Merge into the base data
    df_train = pd.merge(df_train, df_previous_application_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_previous_application_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_previous_application_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_previous_application_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_previous_application_agg_numric, df_previous_application_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # pos_cash_balance
    #---------------------------
    if( args.feature_format ):
        df_pos_cash_balance = read_feature( os.path.join(args.dataset_dir, "POS_CASH_balance.feature") )
    else:
        df_pos_cash_balance = pd.read_csv( os.path.join(args.dataset_dir, "POS_CASH_balance.csv") )

    # Aggregate rows sharing the same SK_ID_PREV
    df_pos_cash_balance_agg_numric = agg_dataframe_numric( df_pos_cash_balance, agg_column = 'SK_ID_PREV', base_column_name = "pos_cash_balance" )
    df_pos_cash_balance_agg_categorical = agg_dataframe_categorical( df_pos_cash_balance, agg_column = 'SK_ID_PREV', base_column_name = "pos_cash_balance", one_hot_encode = one_hot_encode )

    # Attach the corresponding 'SK_ID_PREV' to the 'SK_ID_CURR' of the parent data
    df_pos_cash_balance_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_pos_cash_balance_agg_numric, on = 'SK_ID_PREV', how = 'left')
    df_pos_cash_balance_agg_categorical = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_pos_cash_balance_agg_categorical, on = 'SK_ID_PREV', how = 'left')

    # A single `SK_ID_CURR` now maps to multiple `SK_ID_PREV` rows, so aggregate on `SK_ID_CURR`
    df_pos_cash_balance_agg_numric = agg_dataframe_numric( df_pos_cash_balance_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "pos_cash_balance" )
    df_pos_cash_balance_agg_categorical = agg_dataframe_numric( df_pos_cash_balance_agg_categorical.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "pos_cash_balance" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_pos_cash_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_pos_cash_balance_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_pos_cash_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_pos_cash_balance_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_pos_cash_balance, df_pos_cash_balance_agg_numric, df_pos_cash_balance_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # installments_payments
    #---------------------------
    if( args.feature_format ):
        df_installments_payments = read_feature( os.path.join(args.dataset_dir, "installments_payments.feature") )
    else:
        df_installments_payments = pd.read_csv( os.path.join(args.dataset_dir, "installments_payments.csv") )

    # This table has no categorical columns
    # Aggregate rows sharing the same SK_ID_PREV
    df_installments_payments_agg_numric = agg_dataframe_numric( df_installments_payments, agg_column = 'SK_ID_PREV', base_column_name = "installments_payments" )

    # Attach the corresponding 'SK_ID_PREV' to the 'SK_ID_CURR' of the parent data
    df_installments_payments_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_installments_payments_agg_numric, on = 'SK_ID_PREV', how = 'left')

    # A single `SK_ID_CURR` now maps to multiple `SK_ID_PREV` rows, so aggregate on `SK_ID_CURR`
    df_installments_payments_agg_numric = agg_dataframe_numric( df_installments_payments_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "installments_payments" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_installments_payments_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_installments_payments_agg_numric, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_installments_payments, df_installments_payments_agg_numric
    gc.collect()
    time_bar.update(10)
    #---------------------------
    # credit_card_balance
    #---------------------------
    if( args.feature_format ):
        df_credit_card_balance = read_feature( os.path.join(args.dataset_dir, "credit_card_balance.feature") )
    else:
        df_credit_card_balance = pd.read_csv( os.path.join(args.dataset_dir, "credit_card_balance.csv") )

    # Aggregate rows sharing the same SK_ID_PREV
    df_credit_card_balance_agg_numric = agg_dataframe_numric( df_credit_card_balance, agg_column = 'SK_ID_PREV', base_column_name = "credit_card_balance" )
    df_credit_card_balance_agg_categorical = agg_dataframe_categorical( df_credit_card_balance, agg_column = 'SK_ID_PREV', base_column_name = "credit_card_balance", one_hot_encode = one_hot_encode )

    # Attach the corresponding 'SK_ID_PREV' to the 'SK_ID_CURR' of the parent data
    df_credit_card_balance_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_credit_card_balance_agg_numric, on = 'SK_ID_PREV', how = 'left')
    df_credit_card_balance_agg_categorical = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_credit_card_balance_agg_categorical, on = 'SK_ID_PREV', how = 'left')

    # A single `SK_ID_CURR` now maps to multiple `SK_ID_PREV` rows, so aggregate on `SK_ID_CURR`
    df_credit_card_balance_agg_numric = agg_dataframe_numric( df_credit_card_balance_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "credit_card_balance" )
    df_credit_card_balance_agg_categorical = agg_dataframe_numric( df_credit_card_balance_agg_categorical.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "credit_card_balance" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_credit_card_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_credit_card_balance_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_credit_card_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_credit_card_balance_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_credit_card_balance, df_credit_card_balance_agg_numric, df_credit_card_balance_agg_categorical
    gc.collect()
    time_bar.update(10)

    #===========================
    # Additional features (after merging)
    #===========================
    # Features containing anomalous values
    if( args.invalid_features ):
        df_train['DAYS_EMPLOYED_ANOM'] = df_train["DAYS_EMPLOYED"] == 365243    # flag for the anomalous value
        df_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
        df_test['DAYS_EMPLOYED_ANOM'] = df_test["DAYS_EMPLOYED"] == 365243      # flag for the anomalous value
        df_test['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

    # Time-related features
    if( args.time_features ):
        df_train['DAYS_BIRTH'] = -1 * df_train['DAYS_BIRTH']
        df_test['DAYS_BIRTH'] = -1 * df_test['DAYS_BIRTH']
        df_train['YEARS_BIRTH'] = df_train['DAYS_BIRTH'] / 365
        df_test['YEARS_BIRTH'] = df_test['DAYS_BIRTH'] / 365
        #df_train['YEARS_BINNED'] = pd.cut(df_train['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
        #df_test['YEARS_BINNED'] = pd.cut(df_test['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))

    #----------------------------
    # Polynomial features (PolynomialFeatures) over the features most strongly correlated with the target
    #----------------------------
    if( args.polynomial_features ):
        df_train_poly_features = df_train[ ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'] ]
        df_train_poly_features_target = df_train[ ["TARGET"] ]
        df_test_poly_features = df_test[ ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'] ]

        # Need to impute missing values
        imputer = SimpleImputer(strategy = 'median')
        df_train_poly_features = imputer.fit_transform(df_train_poly_features)
        df_test_poly_features = imputer.transform(df_test_poly_features)

        # Train the polynomial features and Transform the features
        poly_transformer = PolynomialFeatures(degree = 3)
        poly_transformer.fit(df_train_poly_features)
        df_train_poly_features = poly_transformer.transform(df_train_poly_features)
        df_test_poly_features = poly_transformer.transform(df_test_poly_features)

        # Create a dataframe of the features
        df_train_poly_features = pd.DataFrame(
            df_train_poly_features,
            columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
        )
        df_train_poly_features[target_name] = df_train_poly_features_target

        # Put test features into dataframe
        df_test_poly_features = pd.DataFrame(
            df_test_poly_features,
            columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
        )

        # Merge polynomial features into training dataframe
        df_train_poly_features['SK_ID_CURR'] = df_train['SK_ID_CURR']
        df_train = pd.merge( df_train, df_train_poly_features, on = 'SK_ID_CURR', how = 'left')

        # Merge polynomial features into testing dataframe
        df_test_poly_features['SK_ID_CURR'] = df_test['SK_ID_CURR']
        df_test = pd.merge( df_test, df_test_poly_features, on = 'SK_ID_CURR', how = 'left')

        # Align the dataframes
        df_train.drop(['TARGET_y'], axis=1, inplace=True)
        df_train = df_train.rename( columns={'TARGET_x': 'TARGET'} )
        #df_train, df_test = df_train.align(df_test, join = 'inner', axis = 1)

    time_bar.update(10)

    #===========================
    # Drop columns that are no longer needed (after merging)
    #===========================
    if 'SK_ID_CURR' in df_train.columns:
        df_train.drop(['SK_ID_CURR'], axis=1, inplace=True)
        df_test.drop(['SK_ID_CURR'], axis=1, inplace=True)
    if 'SK_ID_BUREAU' in df_train.columns:
        df_train.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
        df_test.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    if 'SK_ID_PREV' in df_train.columns:
        df_train.drop(['SK_ID_PREV'], axis=1, inplace=True)
        df_test.drop(['SK_ID_PREV'], axis=1, inplace=True)

    #===========================
    # Process all features at once
    #===========================
    # Full dataset (train + test)
    df_data = pd.concat([df_train, df_test], sort=False)

    for col in df_train.columns:
        # Target variable
        if( col in [target_name] ):
            continue

        #-----------------------------
        # Fill missing values
        #-----------------------------
        # Fill NaN with the mean value
        if( col in ["OWN_CAR_AGE"] ):
            # Use the mean over the full dataset df_data
            df_data[col].fillna(np.mean(df_data[col]), inplace=True)
            df_train[col].fillna(np.mean(df_data[col]), inplace=True)
            df_test[col].fillna(np.mean(df_data[col]), inplace=True)
        # Fill NaN with zero / int types
        elif( df_train[col].dtypes in ["int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"] ):
            df_data[col].fillna(0, inplace=True)
            df_train[col].fillna(0, inplace=True)
            df_test[col].fillna(0, inplace=True)
        # Fill NaN with zero / float types
        elif( df_train[col].dtypes in ["float16", "float32", "float64", "float128"] ):
            df_data[col].fillna(0.0, inplace=True)
            df_train[col].fillna(0.0, inplace=True)
            df_test[col].fillna(0.0, inplace=True)
        # Fill NaN with a placeholder value / object types
        else:
            df_data[col] = df_data[col].fillna('NA')
            df_train[col] = df_train[col].fillna('NA')
            df_test[col] = df_test[col].fillna('NA')

        #-----------------------------
        # Encode label (categorical) information
        #-----------------------------
        if( df_train[col].dtypes == "object" ):
            label_encoder = LabelEncoder()
            label_encoder.fit(list(df_data[col]))
            df_train[col] = label_encoder.transform(list(df_train[col]))

            label_encoder = LabelEncoder()
            label_encoder.fit(list(df_data[col]))
            df_test[col] = label_encoder.transform(list(df_test[col]))
        #-----------------------------
        # Normalization
        #-----------------------------
        """
        if( df_train[col].dtypes in ["float16", "float32", "float64", "float128"] ):
            scaler = StandardScaler()
            scaler.fit( df_train[col].values.reshape(-1,1) )
            df_train[col] = scaler.transform( df_train[col].values.reshape(-1,1) )
            df_test[col] = scaler.transform( df_test[col].values.reshape(-1,1) )
        """

        #-----------------------------
        # Remove features with a single unique value
        #-----------------------------
        """
        if( df_train[col].nunique() == 1 ):
            print( "remove {} : {}".format(col, df_train[col].nunique()) )
            df_train.drop([col], axis=1, inplace=True)
            df_test.drop([col], axis=1, inplace=True)
        """

    time_bar.update(10)
    gc.disable()
    return df_train, df_test