def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y = trn.loss.values
    n_trn = trn.shape[0]
    trn.drop('loss', axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info('categorical: {}, numerical: {}'.format(len(cat_cols),
                                                         len(num_cols)))

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    # .ix is removed in recent pandas; assign the encoded block directly
    df[cat_cols] = lbe.fit_transform(df[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn], y, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
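
# Note: LabelEncoder(min_obs=10) here is kaggler's encoder, not sklearn's:
# labels seen fewer than min_obs times share one group instead of getting
# their own code. A minimal sketch of the idea -- a hypothetical stand-in,
# not kaggler's actual implementation:
import pandas as pd


class MinObsLabelEncoder:
    def __init__(self, min_obs=10):
        self.min_obs = min_obs
        self.maps = {}

    def fit_transform(self, df):
        out = pd.DataFrame(index=df.index)
        for col in df.columns:
            counts = df[col].value_counts()
            frequent = counts[counts >= self.min_obs].index
            # frequent labels get 1..K; rare/unseen labels collapse to 0
            self.maps[col] = {v: i + 1 for i, v in enumerate(frequent)}
            out[col] = df[col].map(self.maps[col]).fillna(0).astype(int)
        return out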
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    # use .values so the fresh RangeIndex from reset_index() does not
    # misalign with the ID_COL-indexed frame
    trn['time_feat'] = trn.reset_index()['time'].values
    tst['time_feat'] = tst.reset_index()['time'].values

    # Generate a feature identifying batches. Batches are 500,000 records
    # long; this would be used to generate grouped lagged features.
    # batch_size = 500000
    #
    # n_trn = trn.shape[0]
    # for i in range(1, int((n_trn / batch_size) + 1)):
    #     beg = (i - 1) * batch_size
    #     end = i * batch_size - 1
    #     trn.loc[beg:end, 'batch'] = i
    #
    # n_tst = tst.shape[0]
    # for i in range(1, int((n_tst / batch_size) + 1)):
    #     beg = (i - 1) * batch_size
    #     end = i * batch_size - 1
    #     tst.loc[beg:end, 'batch'] = i

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values, y.values, train_feature_file)
    save_data(tst.values, None, test_feature_file)
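
# The commented-out batch loops above can be expressed without a loop.
# A sketch assuming contiguous 500,000-row batches and positional (not
# label-based) row order:
import numpy as np

BATCH_SIZE = 500_000


def add_batch_feature(df, batch_size=BATCH_SIZE):
    # batch 1 covers rows 0..batch_size-1, batch 2 the next block, etc.
    df['batch'] = np.arange(len(df)) // batch_size + 1
    return df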
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_header_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info(f'categorical: {len(cat_cols)}, numerical: {len(num_cols)}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    df[cat_cols] = lbe.fit_transform(df[cat_cols])
    df[num_cols] = df[num_cols].fillna(-1)

    with open(feature_header_file, 'w') as f:
        for col in df.columns:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn], y, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y_trn = trn['target']
    del trn['target']

    trn_tst = trn.append(tst)
    n_trn = len(trn)
    logging.info('trn_shape: {}, tst_shape: {}, all shape: {}'.format(
        trn.shape, tst.shape, trn_tst.shape))

    logging.info('One Hot Encoding categorical variables')
    ohe = OneHotEncoder(min_obs=1)
    trn_tst_ohe = ohe.fit_transform(trn_tst)
    trn_tst_ohe = trn_tst_ohe.tocsr()
    logging.info(f'shape of all data after OHE: {trn_tst_ohe.shape}')

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn_tst_ohe[:n_trn], y_trn, train_feature_file)
    save_data(trn_tst_ohe[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]
    features = [x for x in trn.columns if x not in [ID_COL, TARGET_COL]]

    df = pd.concat(
        [trn.drop([TARGET_COL, ID_COL], axis=1), tst.drop(ID_COL, axis=1)],
        axis=0)

    logging.info('label encoding')
    lbe = LabelEncoder(min_obs=50)
    df[features] = lbe.fit_transform(df[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            f.write('{}\t{}\tint\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn], y.values, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)
    logging.info(f'categorical: {trn.shape[1]}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('target encoding categorical variables')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn, y)
    df = te.transform(df)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn], y.values, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
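
# Note: TargetEncoder(cv=...) comes from kaggler and fits per-fold encodings
# so that no training row is encoded with its own target. A minimal sketch
# of the out-of-fold scheme for one column -- a simplified reconstruction
# (plain mean encoding with a global-mean fallback; kaggler also smooths):
import numpy as np
import pandas as pd


def oof_target_encode(trn_col, y, cv):
    """Encode one categorical column with out-of-fold target means."""
    encoded = pd.Series(np.nan, index=trn_col.index)
    for i_trn, i_val in cv.split(trn_col, y):
        # means computed only on the in-fold rows
        fold_means = y.iloc[i_trn].groupby(trn_col.iloc[i_trn]).mean()
        encoded.iloc[i_val] = trn_col.iloc[i_val].map(fold_means).values
    # categories unseen in a fold fall back to the prior
    return encoded.fillna(y.mean())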
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, encoder_name, encoder_dim, lrate,
                     dropout, model_file, feature_map_file, n_est, n_stop,
                     batch_size):
    logging.info('loading base feature files')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    n_trn = X.shape[0]

    logging.info('combining training and test features')
    X = sparse.vstack((X, X_tst))

    autoencoder, encoder = get_model(model_name=encoder_name,
                                     input_dim=X.shape[1],
                                     encoder_dim=encoder_dim,
                                     learning_rate=lrate,
                                     dropout=dropout)

    logging.info('training an autoencoder')
    # summary() itself returns None, so route its output through logging
    autoencoder.summary(print_fn=logging.info)

    i_trn, i_val = train_test_split(np.arange(X.shape[0]), test_size=.2,
                                    random_state=SEED, shuffle=True)

    es = EarlyStopping(monitor='val_loss', patience=n_stop)
    mcp = ModelCheckpoint(model_file, monitor='val_loss',
                          save_best_only=True, save_weights_only=False)
    h = autoencoder.fit_generator(
        generator(X[i_trn], X[i_trn], batch_size),
        steps_per_epoch=int(np.ceil(len(i_trn) / batch_size)),
        epochs=n_est,
        validation_data=generator(X[i_val], X[i_val], batch_size),
        validation_steps=int(np.ceil(len(i_val) / batch_size)),
        callbacks=[es, mcp])

    val_losses = h.history['val_loss']
    n_best = val_losses.index(min(val_losses)) + 1
    autoencoder.load_weights(model_file)
    logging.info('best epoch={}'.format(n_best))

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(range(encoder_dim)):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    P = encoder.predict_generator(generator(X[:n_trn], None, batch_size),
                                  steps=int(np.ceil(n_trn / batch_size)))
    save_data(sparse.csr_matrix(P), y, train_feature_file)
    P = encoder.predict_generator(
        generator(X[n_trn:], None, batch_size),
        steps=int(np.ceil((X.shape[0] - n_trn) / batch_size)))
    save_data(sparse.csr_matrix(P), None, test_feature_file)
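
# Note: the `generator` helper used above is not shown in this excerpt.
# Judging by its call sites, it is assumed to loop forever over mini-batches
# of a (possibly sparse) matrix, densifying each slice for Keras, and to
# yield inputs only when the target argument is None (prediction). A
# minimal sketch under those assumptions:
import numpy as np


def generator(X, y, batch_size):
    """Yield mini-batches indefinitely; (x, y) pairs for training,
    plain x batches when y is None (prediction)."""
    n = X.shape[0]
    while True:
        for beg in range(0, n, batch_size):
            xb = X[beg:beg + batch_size]
            xb = xb.toarray() if hasattr(xb, 'toarray') else xb
            if y is None:
                yield xb
            else:
                yb = y[beg:beg + batch_size]
                yb = yb.toarray() if hasattr(yb, 'toarray') else yb
                yield xb, yb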
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    # log1p transform the power-law distributed features
    df['nObserve'] = df['nObserve'].apply(np.log1p)

    # add diff features
    df['d_dered_u'] = df['dered_u'] - df['u']
    df['d_dered_g'] = df['dered_g'] - df['g']
    df['d_dered_r'] = df['dered_r'] - df['r']
    df['d_dered_i'] = df['dered_i'] - df['i']
    df['d_dered_z'] = df['dered_z'] - df['z']
    df['d_dered_rg'] = df['dered_r'] - df['dered_g']
    df['d_dered_ig'] = df['dered_i'] - df['dered_g']
    df['d_dered_zg'] = df['dered_z'] - df['dered_g']
    df['d_dered_ri'] = df['dered_r'] - df['dered_i']
    df['d_dered_rz'] = df['dered_r'] - df['dered_z']
    df['d_dered_iz'] = df['dered_i'] - df['dered_z']
    df['d_obs_det'] = df['nObserve'] - df['nDetect']

    # drop redundant features
    df.drop(['airmass_z', 'airmass_i', 'airmass_r', 'airmass_g', 'u', 'g',
             'r', 'i', 'nDetect', 'd_dered_rg', 'd_dered_ri'],
            axis=1, inplace=True)

    scaler = StandardScaler()
    poly = PolynomialFeatures(2)
    X = poly.fit_transform(scaler.fit_transform(df))
    feature_names = poly.get_feature_names(df.columns)

    with open(feature_map_file, 'w') as f:
        for col in feature_names:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(X[:n_trn], y, train_feature_file)
    save_data(X[n_trn:], None, test_feature_file)
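
# PolynomialFeatures(2) with the default bias term expands d inputs into
# C(d + 2, 2) = 1 + 2d + d(d - 1)/2 columns (bias, linear terms, squares,
# and pairwise products). A quick self-contained check of that count:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

d = 5
X_demo = np.random.rand(3, d)
X_poly = PolynomialFeatures(2).fit_transform(X_demo)
assert X_poly.shape[1] == 1 + 2 * d + d * (d - 1) // 2  # 21 columns for d=5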
def merge_sub_features(train_file, test_file, train_sub_features,
                       test_sub_features, train_feature_file,
                       test_feature_file, lowest):
    trn_subfeat = []
    tst_subfeat = []

    for f_trn, f_tst in zip([x for x in train_sub_features.split(' ') if x],
                            [x for x in test_sub_features.split(' ') if x]):
        logging.info('Reading trn {0} tst {1}'.format(f_trn, f_tst))
        X_sub_trn, _ = load_data(f_trn)
        X_sub_tst, _ = load_data(f_tst)
        if not ssp.issparse(X_sub_trn):
            X_sub_trn = ssp.csr_matrix(X_sub_trn)
            X_sub_tst = ssp.csr_matrix(X_sub_tst)

        trn_subfeat.append(X_sub_trn)
        tst_subfeat.append(X_sub_tst)
        logging.info('Size trn {0} tst {1}'.format(X_sub_trn.shape,
                                                   X_sub_tst.shape))

    df_train = pd.read_csv(train_file)
    y_train = df_train[TARGET].values

    logging.info('Merge sub features')
    X_trn = ssp.hstack(trn_subfeat).tocsr()
    X_tst = ssp.hstack(tst_subfeat).tocsr()
    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    drop = feature_selection.DropInactive(lowest)
    drop.fit(X_trn)
    X_trn = drop.transform(X_trn)
    X_tst = drop.transform(X_tst)
    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    logging.info('saving features')
    save_data(X_trn, y_train, train_feature_file)
    save_data(X_tst, None, test_feature_file)
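
# Note: feature_selection.DropInactive is a project-local transformer whose
# source is not in this excerpt. A plausible minimal sketch, assuming it
# keeps only columns with at least `lowest` nonzero entries in the training
# matrix (a hypothetical reconstruction, not the repo's actual code):
import numpy as np
import scipy.sparse as ssp


class DropInactive:
    """Keep only columns with at least `lowest` nonzero entries."""

    def __init__(self, lowest):
        self.lowest = lowest

    def fit(self, X):
        nnz = np.asarray((X != 0).sum(axis=0)).ravel()
        self.cols = np.where(nnz >= self.lowest)[0]
        return self

    def transform(self, X):
        return X[:, self.cols]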
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    train = pd.read_csv(train_file, index_col='id')
    test = pd.read_csv(test_file, index_col='id')

    y_trn = train['target']
    del train['target']

    N_train = len(train)
    N_test = len(test)
    train_test = train.append(test)
    logging.info('trn_shape: {}, tst_shape: {}, all shape: {}'.format(
        train.shape, test.shape, train_test.shape))

    logging.info('Create features for entity embedding: '
                 'fill in missing with mode')
    features = [x for x in train.columns if x not in ['id']]
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        # .mode() returns a Series; take its first value so fillna gets a
        # scalar instead of an index-aligned Series
        fillin_val = train_test[feat].mode()[0]
        train_test[feat] = lbl_enc.fit_transform(
            train_test[feat].fillna(fillin_val).astype(str).values)

    train = train_test[:N_train].reset_index(drop=True)
    test = train_test[N_train:].reset_index(drop=True)

    assert test.loc[:, features].values.shape[1] == 23
    # entity-embedding networks take one input per categorical column,
    # so split the test matrix column-wise
    test_data = [
        test.loc[:, features].values[:, k]
        for k in range(test.loc[:, features].values.shape[1])
    ]

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(train.columns):
            if col != 'id':
                f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(train, y_trn, train_feature_file)
    save_data(test, None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    trn['date'] = trn.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))
    tst['date'] = tst.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))
    trn['year_2017'] = trn.date.apply(lambda x: x.year - 2016)
    tst['year_2017'] = tst.date.apply(lambda x: x.year - 2016)
    trn['month'] = trn.date.apply(lambda x: x.month)
    tst['month'] = tst.date.apply(lambda x: x.month)

    y = trn.target.values
    n_trn = trn.shape[0]
    trn.drop(['target', 'date', 'f_19'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19'], axis=1, inplace=True)

    cat_cols = (['customer_id'] +
                [x for x in trn.columns if trn[x].dtype == object])
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info('categorical: {}, numerical: {}'.format(len(cat_cols),
                                                         len(num_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    # .ix is removed in recent pandas; assign the encoded blocks directly
    trn[cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst[cat_cols] = lbe.transform(tst[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values.astype(float), y, train_feature_file)
    save_data(tst.values.astype(float), None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y = trn.loss.values
    n_trn = trn.shape[0]
    trn.drop('loss', axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info('categorical: {}, numerical: {}'.format(len(cat_cols),
                                                         len(num_cols)))

    df = pd.concat([trn, tst], axis=0)

    logging.info('normalizing numeric features')
    nm = Normalizer()
    df[num_cols] = nm.fit_transform(df[num_cols].values)

    logging.info('one-hot encoding categorical variables')
    ohe = OneHotEncoder(min_obs=10)
    X_ohe = ohe.fit_transform(df[cat_cols].values)
    ohe_cols = ['ohe{}'.format(i) for i in range(X_ohe.shape[1])]
    X = sparse.hstack((df[num_cols].values, X_ohe), format='csr')

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(num_cols + ohe_cols):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(X[:n_trn], y, train_feature_file)
    save_data(X[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]
    features = [x for x in trn.columns if x not in [ID_COL, TARGET_COL]]

    logging.info('target encoding')
    # random_state requires shuffle=True in recent scikit-learn
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    trn[features] = te.fit_transform(trn[features], y)
    tst[features] = te.transform(tst[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            # target-encoded features are continuous, hence type 'q'
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn[features].values, y.values, train_feature_file)
    save_data(tst[features].values, None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_header_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info(f'categorical: {len(cat_cols)}, numerical: {len(num_cols)}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('normalizing numeric features')
    nm = Normalizer()
    df[num_cols] = nm.fit_transform(df[num_cols].values)

    logging.info('one-hot encoding categorical variables')
    ohe = OneHotEncoder(min_obs=10)
    X_ohe = ohe.fit_transform(df[cat_cols])
    ohe_cols = [f'ohe{i}' for i in range(X_ohe.shape[1])]
    X = sparse.hstack((df[num_cols].values, X_ohe), format='csr')

    with open(feature_header_file, 'w') as f:
        for col in num_cols + ohe_cols:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(X[:n_trn], y, train_feature_file)
    save_data(X[n_trn:], None, test_feature_file)
import argparse

import pandas as pd

from kaggler.data_io import save_data

FEATURES = [
    'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
    'len_word_q1', 'len_word_q2', 'common_words',
    'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio', 'fuzz_token_sort_ratio',
    'wmd', 'norm_wmd',
    'cosine_distance', 'cityblock_distance', 'jaccard_distance',
    'canberra_distance', 'euclidean_distance', 'minkowski_distance',
    'braycurtis_distance',
    'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec',
]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True, dest='input_file')
    parser.add_argument('--output', '-o', required=True, dest='output_file')
    args = parser.parse_args()

    df = pd.read_csv(args.input_file)
    save_data(df[FEATURES].values, None, args.output_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    # Fill empty values with NaN
    trn = trn.fillna(np.nan)
    tst = tst.fillna(np.nan)

    # Fill null Fare in test (assign the result; fillna is not in-place here)
    tst.Fare = tst.Fare.fillna(tst.Fare.median())

    # Apply log to Fare to reduce the skewness of its distribution
    trn.Fare = trn.Fare.map(lambda i: np.log(i) if i > 0 else 0)
    tst.Fare = tst.Fare.map(lambda i: np.log(i) if i > 0 else 0)

    # Fill null Embarked in train
    trn.Embarked = trn.Embarked.fillna('S')

    # convert Embarked into a categorical value
    trn.Embarked = trn.Embarked.map({'S': 0, 'Q': 1, 'C': 2})
    tst.Embarked = tst.Embarked.map({'S': 0, 'Q': 1, 'C': 2})

    # convert Sex into a categorical value: 0 for male and 1 for female
    trn.Sex = trn.Sex.map({'male': 0, 'female': 1})
    tst.Sex = tst.Sex.map({'male': 0, 'female': 1})

    # Fill Age with the median age of similar rows
    # according to Pclass, Parch and SibSp
    train_index_nan_age = list(trn.Age[trn.Age.isnull()].index)
    for i in train_index_nan_age:
        age_med = trn.Age.median()
        age_pred = trn.Age[(trn.SibSp == trn.iloc[i].SibSp)
                           & (trn.Parch == trn.iloc[i].Parch)
                           & (trn.Pclass == trn.iloc[i].Pclass)].median()
        trn.Age.iat[i] = age_pred if not np.isnan(age_pred) else age_med

    test_index_nan_age = list(tst.Age[tst.Age.isnull()].index)
    for i in test_index_nan_age:
        age_med = tst.Age.median()
        age_pred = tst.Age[(tst.SibSp == tst.iloc[i].SibSp)
                           & (tst.Parch == tst.iloc[i].Parch)
                           & (tst.Pclass == tst.iloc[i].Pclass)].median()
        tst.Age.iat[i] = age_pred if not np.isnan(age_pred) else age_med

    # Get title from Name
    trn['Title'] = pd.Series(
        [i.split(',')[1].split('.')[0].strip() for i in trn.Name])
    tst['Title'] = pd.Series(
        [i.split(',')[1].split('.')[0].strip() for i in tst.Name])

    # Convert Title to categorical values
    rare_titles = ['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don',
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_map = {'Master': 0, 'Miss': 1, 'Ms': 1, 'Mme': 1, 'Mlle': 1,
                 'Mrs': 1, 'Mr': 2, 'Rare': 3}
    trn.Title = trn.Title.replace(rare_titles, 'Rare').map(title_map)
    trn.Title = trn.Title.astype(int)
    tst.Title = tst.Title.replace(rare_titles, 'Rare').map(title_map)
    tst.Title = tst.Title.astype(int)

    # Create a family size descriptor from SibSp and Parch
    trn['Fsize'] = trn.SibSp + trn.Parch + 1
    tst['Fsize'] = tst.SibSp + tst.Parch + 1

    # Create new features from family size
    for df_ in (trn, tst):
        df_['Single'] = df_.Fsize.map(lambda s: 1 if s == 1 else 0)
        df_['SmallF'] = df_.Fsize.map(lambda s: 1 if s == 2 else 0)
        df_['MedF'] = df_.Fsize.map(lambda s: 1 if 3 <= s <= 4 else 0)
        df_['LargeF'] = df_.Fsize.map(lambda s: 1 if s >= 5 else 0)

    # convert Title and Embarked to indicator values; both frames get both
    # sets of dummies so the train and test columns line up after concat
    trn = pd.get_dummies(trn, columns=['Title'])
    tst = pd.get_dummies(tst, columns=['Title'])
    trn = pd.get_dummies(trn, columns=['Embarked'], prefix='Em')
    tst = pd.get_dummies(tst, columns=['Embarked'], prefix='Em')

    # Replace the Cabin number by the cabin letter, or 'X' if missing
    trn.Cabin = pd.Series(['X' if pd.isnull(i) else i[0] for i in trn.Cabin])
    tst.Cabin = pd.Series(['X' if pd.isnull(i) else i[0] for i in tst.Cabin])

    # convert Cabin to indicator values
    trn = pd.get_dummies(trn, columns=['Cabin'], prefix='Cabin')
    tst = pd.get_dummies(tst, columns=['Cabin'], prefix='Cabin')

    # Treat Ticket by extracting the ticket prefix;
    # when there is no prefix, return 'X'.
    def ticket_prefix(ticket):
        if ticket.isdigit():
            return 'X'
        return ticket.replace('.', '').replace('/', '').strip().split(' ')[0]

    trn.Ticket = [ticket_prefix(i) for i in trn.Ticket]
    trn = pd.get_dummies(trn, columns=['Ticket'], prefix='T')
    tst.Ticket = [ticket_prefix(i) for i in tst.Ticket]
    tst = pd.get_dummies(tst, columns=['Ticket'], prefix='T')

    # Create categorical values for Pclass
    trn.Pclass = trn.Pclass.astype('category')
    trn = pd.get_dummies(trn, columns=['Pclass'], prefix='Pc')
    tst.Pclass = tst.Pclass.astype('category')
    tst = pd.get_dummies(tst, columns=['Pclass'], prefix='Pc')

    # drop redundant features
    trn.drop(['Name', 'PassengerId'], axis=1, inplace=True)
    tst.drop(['Name', 'PassengerId'], axis=1, inplace=True)

    # concat trn and tst
    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    with open(feature_map_file, 'w') as f:
        for col in df.columns:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn], y, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn.loc[:, TARGET_COL]
    n_trn = trn.shape[0]
    trn = trn.drop(TARGET_COL, axis=1)
    df = pd.concat([trn, tst], axis=0)

    # build features
    features_bin = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
    features_cat = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
    features_hex = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
    features_ord = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
    features_cyc = ['day', 'month']

    logging.info('Dummy encode: bin 0 to 4')
    # convert bins 0, 1, 2 to object so that get_dummies recognizes them
    # and creates missing indicators
    bin_012 = ['bin_0', 'bin_1', 'bin_2']
    df[bin_012] = df[bin_012].astype(object)
    dummies = pd.get_dummies(df[features_bin], dummy_na=True)
    df = df.drop(features_bin, axis=1)
    df = pd.concat([df, dummies], axis=1)

    logging.info('Target encoding: nom 0 to 9, ordinals and cyclical features')
    target_enc_cols = features_ord + features_cat + features_hex + features_cyc
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn.loc[:, target_enc_cols], y)
    df.loc[:, target_enc_cols] = te.transform(df.loc[:, target_enc_cols])

    # logging.info('Label encode ordinals: ord 0 to 5')
    # map_ord_0 = None  # already a numeric column
    # map_ord_1 = {'Novice': 1, 'Contributor': 2,
    #              'Expert': 3, 'Master': 4, 'Grandmaster': 5}
    # map_ord_2 = {'Freezing': 1, 'Cold': 2, 'Warm': 3,
    #              'Hot': 4, 'Boiling Hot': 5, 'Lava Hot': 6}
    # map_ord_3 = dict(zip(df['ord_3'].value_counts().sort_index().keys(),
    #                      range(1, len(df['ord_3'].value_counts()) + 1)))
    # map_ord_4 = dict(zip(df['ord_4'].value_counts().sort_index().keys(),
    #                      range(1, len(df['ord_4'].value_counts()) + 1)))
    #
    # temp_ord_5 = pd.DataFrame(
    #     df['ord_5'].value_counts().sort_index().keys(), columns=['ord_5'])
    # temp_ord_5['First'] = temp_ord_5['ord_5'].astype(str).str[0].str.upper()
    # temp_ord_5['Second'] = temp_ord_5['ord_5'].astype(str).str[1].str.upper()
    # temp_ord_5['First'] = temp_ord_5['First'].replace(map_ord_4)
    # temp_ord_5['Second'] = temp_ord_5['Second'].replace(map_ord_4)
    # temp_ord_5['Add'] = temp_ord_5['First'] + temp_ord_5['Second']
    # temp_ord_5['Mul'] = temp_ord_5['First'] * temp_ord_5['Second']
    # map_ord_5 = dict(zip(temp_ord_5['ord_5'], temp_ord_5['Mul']))
    #
    # maps = [map_ord_0, map_ord_1, map_ord_2, map_ord_3, map_ord_4,
    #         map_ord_5]
    # for i, m in zip(range(0, 6), maps):
    #     if i != 0:
    #         df[f'ord_{i}'] = df[f'ord_{i}'].map(m)
    #     df[f'ord_{i}'] = df[f'ord_{i}'].fillna(df[f'ord_{i}'].median())

    # logging.info('cyclical features')
    # df[features_cyc] = df[features_cyc].astype(object)
    # dummies_cyc = pd.get_dummies(df[features_cyc], dummy_na=True)
    # df = df.drop(features_cyc, axis=1)
    # df = pd.concat([df, dummies_cyc], axis=1)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn], y.values, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
print('Generate bow')
bow_extractor = CountVectorizer(max_df=0.999, min_df=50,
                                max_features=1000000, analyzer='word',
                                ngram_range=(1, 6), stop_words='english',
                                binary=True, lowercase=True)

corpus = []
for f in feats:
    data_all[f] = data_all[f].astype(str)
    corpus += data_all[f].values.tolist()
bow_extractor.fit(corpus)

for f in feats:
    bow = bow_extractor.transform(data_all[f].values.tolist())
    train_bow = bow[:train.shape[0]]
    test_bow = bow[train.shape[0]:]
    if 'question1' in f:
        save_data(train_bow, None, args.q1_train_output_file)
        save_data(test_bow, None, args.q1_test_output_file)
    else:
        save_data(train_bow, None, args.q2_train_output_file)
        save_data(test_bow, None, args.q2_test_output_file)
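
# binary=True makes the vectorizer record token presence rather than
# counts, which is what the bag-of-words features above rely on; a quick
# self-contained demo:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(binary=True)
X_demo = v.fit_transform(['cat cat dog'])
print(X_demo.toarray())  # [[1 1]] -- the repeated 'cat' still yields 1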
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    # views of trn and tst
    full_data = [trn, tst]

    # Some features of my own that I have added in.
    # Gives the length of the name
    for dataset in full_data:
        dataset['Name_length'] = dataset.Name.apply(len)

    # Feature that tells whether a passenger had a cabin on the Titanic
    for dataset in full_data:
        dataset['Has_Cabin'] = dataset.Cabin.apply(
            lambda x: 0 if type(x) == float else 1)

    # Feature engineering steps taken from Sina.
    # Create new feature FamilySize as a combination of SibSp and Parch
    for dataset in full_data:
        dataset['FamilySize'] = dataset.SibSp + dataset.Parch + 1

    # Create new feature IsAlone from FamilySize
    for dataset in full_data:
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # Remove all NULLs in the Embarked column
    for dataset in full_data:
        dataset.Embarked = dataset.Embarked.fillna('S')

    # Remove all NULLs in the Fare column
    for dataset in full_data:
        dataset.Fare = dataset.Fare.fillna(dataset.Fare.median())

    # Remove all NULLs in the Age column
    for dataset in full_data:
        age_avg = dataset.Age.mean()
        age_std = dataset.Age.std()
        age_null_count = dataset.Age.isnull().sum()
        age_null_random_list = np.random.randint(age_avg - age_std,
                                                 age_avg + age_std,
                                                 size=age_null_count)
        # use .loc to avoid chained assignment
        dataset.loc[np.isnan(dataset.Age), 'Age'] = age_null_random_list
        dataset.Age = dataset.Age.astype(int)

    # Define function to extract titles from passenger names
    def get_title(name):
        title_search = re.search(r' ([A-Za-z]+)\.', name)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(1)
        return ''

    # Create a new feature Title, containing the titles of passenger names.
    # Column assignment is required: attribute assignment cannot create a
    # new DataFrame column.
    for dataset in full_data:
        dataset['Title'] = dataset.Name.apply(get_title)

    # Group all non-common titles into one single grouping 'Rare'
    for dataset in full_data:
        dataset.Title = dataset.Title.replace(
            ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
             'Sir', 'Jonkheer', 'Dona'], 'Rare')
        dataset.Title = dataset.Title.replace('Mlle', 'Miss')
        dataset.Title = dataset.Title.replace('Ms', 'Miss')
        dataset.Title = dataset.Title.replace('Mme', 'Mrs')

    for dataset in full_data:
        # Mapping Sex
        dataset.Sex = dataset.Sex.map({'female': 0, 'male': 1}).astype(int)

        # Mapping Titles
        title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
        dataset.Title = dataset.Title.map(title_mapping)
        dataset.Title = dataset.Title.fillna(0)

        # Mapping Embarked
        dataset.Embarked = dataset.Embarked.map(
            {'S': 0, 'C': 1, 'Q': 2}).astype(int)

        # Mapping Fare
        def map_fare(x):
            if x <= 7.91:
                return 0
            elif x <= 14.454:
                return 1
            elif x <= 31:
                return 2
            else:
                return 3

        dataset.Fare = dataset.Fare.apply(map_fare)
        dataset.Fare = dataset.Fare.astype(int)

        # Mapping Age
        def map_age(x):
            if x <= 16:
                return 0
            elif x <= 32:
                return 1
            elif x <= 48:
                return 2
            elif x <= 64:
                return 3
            else:
                return 4

        dataset.Age = dataset.Age.apply(map_age)

    # drop redundant features
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    trn.drop(drop_elements, axis=1, inplace=True)
    tst.drop(drop_elements, axis=1, inplace=True)

    # concat trn and tst
    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    with open(feature_map_file, 'w') as f:
        for col in df.columns:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn], y, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
                        dest='test_output_file')
    args = parser.parse_args()

    train = pd.read_csv(args.train_file).astype(str)
    test = pd.read_csv(args.test_file).astype(str)

    feats = []

    print('Generate intersection')
    train['question_intersection'] = train.astype(str).apply(
        lambda x: calc_set_intersection(x['question1'], x['question2']),
        axis=1)
    test['question_intersection'] = test.astype(str).apply(
        lambda x: calc_set_intersection(x['question1'], x['question2']),
        axis=1)
    feats.append('question_intersection')

    print('Generate porter intersection')
    train['question_porter_intersection'] = train.astype(str).apply(
        lambda x: calc_set_intersection(x['question1_porter'],
                                        x['question2_porter']), axis=1)
    test['question_porter_intersection'] = test.astype(str).apply(
        lambda x: calc_set_intersection(x['question1_porter'],
                                        x['question2_porter']), axis=1)
    feats.append('question_porter_intersection')

    save_data(train[feats].values, None, args.train_output_file)
    save_data(test[feats].values, None, args.test_output_file)
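
# Note: calc_set_intersection is defined elsewhere in the repo. A minimal
# sketch of what it is assumed to compute -- the share of question-1 tokens
# that also occur in question 2 (a hypothetical reconstruction; the exact
# normalization may differ):
def calc_set_intersection(text_a, text_b):
    a = set(text_a.lower().split())
    b = set(text_b.lower().split())
    return len(a & b) / max(len(a), 1)  # guard against empty strings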
    features.append('freq_min')
    features.append('freq_max')
    features.append('freq_meam')  # (sic) must match the column created above
    features.append('freq_diff1')
    features.append('freq_diff2')

    train_comb = comb[comb['is_duplicate'] >= 0][features]
    test_comb = comb[comb['is_duplicate'] < 0][features]
    return train_comb, test_comb


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', required=True, dest='train_file')
    parser.add_argument('--test-file', required=True, dest='test_file')
    parser.add_argument('--train-output-file', required=True,
                        dest='train_output_file')
    parser.add_argument('--test-output-file', required=True,
                        dest='test_output_file')
    args = parser.parse_args()

    train = pd.read_csv(args.train_file).astype(str)
    test = pd.read_csv(args.test_file).astype(str)

    train_feats, test_feats = generate_frequency_features(
        train, test, 'question1', 'question2')
    train_feats_porter, test_feats_porter = generate_frequency_features(
        train, test, 'question1_porter', 'question2_porter')

    train_magic = pd.concat([train_feats, train_feats_porter], axis=1)
    test_magic = pd.concat([test_feats, test_feats_porter], axis=1)

    save_data(train_magic.values, None, args.train_output_file)
    save_data(test_magic.values, None, args.test_output_file)
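
# Note: the top of generate_frequency_features is outside this excerpt. A
# sketch of the "magic" frequency idea it implements, assuming occurrence
# counts over the combined train + test questions. add_frequency_features
# and the freq_diff definitions below are hypothetical reconstructions, not
# the repo's actual code:
import pandas as pd


def add_frequency_features(df, counts, q1_col, q2_col):
    """Attach min/max/mean/diff of per-question occurrence counts."""
    f1 = df[q1_col].map(counts).astype(float)
    f2 = df[q2_col].map(counts).astype(float)
    out = pd.DataFrame(index=df.index)
    out['freq_min'] = pd.concat([f1, f2], axis=1).min(axis=1)
    out['freq_max'] = pd.concat([f1, f2], axis=1).max(axis=1)
    out['freq_meam'] = (f1 + f2) / 2      # mean; name matches the list above
    out['freq_diff1'] = f1 - f2           # assumed signed difference
    out['freq_diff2'] = (f1 - f2).abs()   # assumed absolute difference
    return out

# counts would be built over all questions in train and test, e.g.:
# counts = pd.concat([train['question1'], train['question2'],
#                     test['question1'], test['question2']]).value_counts()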
    q2_vec, _ = load_data(vec_files[1])

    distances = []
    # e.g. 'euclidean', 'cosine', 'l2', 'l1', 'cityblock', 'manhattan'
    for d in sklearn.metrics.pairwise.PAIRED_DISTANCES.keys():
        distances.append(
            sklearn.metrics.pairwise.paired_distances(q1_vec, q2_vec,
                                                      metric=d))
    return np.transpose(np.vstack(distances))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-vec-files', required=True,
                        dest='train_files')
    parser.add_argument('--test-vec-files', required=True,
                        dest='test_files')
    parser.add_argument('--train-output-file', required=True,
                        dest='train_output_file')
    parser.add_argument('--test-output-file', required=True,
                        dest='test_output_file')
    args = parser.parse_args()

    train_dis = calculate_distance(args.train_files.split(' '))
    test_dis = calculate_distance(args.test_files.split(' '))

    save_data(train_dis, None, args.train_output_file)
    save_data(test_dis, None, args.test_output_file)
ngram_range = (1, 2)
min_df = 3

print('Generate ' + str(args.question_col) + ' tfidf')
feats = ['question1', 'question2']
if args.question_col:
    feats = ['_'.join([x, args.question_col]) for x in feats]

vect_orig = TfidfVectorizer(max_features=max_features,
                            ngram_range=ngram_range, min_df=min_df)

corpus = []
for f in feats:
    data_all[f] = data_all[f].astype(str)
    corpus += data_all[f].values.tolist()
vect_orig.fit(corpus)

for f in feats:
    tfidfs = vect_orig.transform(data_all[f].values.tolist())
    train_tfidf = tfidfs[:train.shape[0]]
    test_tfidf = tfidfs[train.shape[0]:]
    if 'question1' in f:
        save_data(train_tfidf, None, args.q1_train_output_file)
        save_data(test_tfidf, None, args.q1_test_output_file)
    else:
        save_data(train_tfidf, None, args.q2_train_output_file)
        save_data(test_tfidf, None, args.q2_test_output_file)
logging.info('combining base features for training data')
is_sparse = False
Xs = []
for base_feature in args.base_train_features:
    X, y = load_data(base_feature)
    is_sparse = sparse.issparse(X) or is_sparse
    Xs.append(X)

if is_sparse:
    X = sparse.hstack(Xs).todense()
else:
    X = np.hstack(Xs)

# keep only columns with nonzero variance; reuse the same mask for test
idx = np.array(X.std(axis=0) != 0).reshape(-1, )
X = X[:, idx]
save_data(X, y, args.train_feature_file)

logging.info('combining base features for test data')
Xs = []
for base_feature in args.base_test_features:
    X, y = load_data(base_feature)
    Xs.append(X)

if is_sparse:
    X = sparse.hstack(Xs).todense()
else:
    X = np.hstack(Xs)

X = X[:, idx]
save_data(X, y, args.test_feature_file)
print('Creating count vector')
counts_vectorizer = CountVectorizer(max_features=10000 - 1).fit(
    itertools.chain(df_all['question1'], df_all['question2']))
other_index = len(counts_vectorizer.vocabulary_)

X1_train_all = create_padded_seqs(
    df_all[df_all['id'].notnull()]['question1'])
y_train_all = df_all[df_all['id'].notnull()]['is_duplicate'].values
X2_train_all = create_padded_seqs(
    df_all[df_all['id'].notnull()]['question2'])

X1_test = create_padded_seqs(
    df_all[df_all['test_id'].notnull()]['question1'])
X2_test = create_padded_seqs(
    df_all[df_all['test_id'].notnull()]['question2'])

X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
    train_test_split(X1_train_all, X2_train_all, y_train_all,
                     stratify=y_train_all, test_size=0.3, random_state=1989)

feats = []
model = LSTMModel()
model.fit(X1_train, X2_train, X1_val, X2_val, y_train, y_val)

train_features = model.extractFeatures(X1_train_all, X2_train_all)
test_features = model.extractFeatures(X1_test, X2_test)

save_data(train_features.astype(float), None, args.train_output_file)
save_data(test_features.astype(float), None, args.test_output_file)

features = range(train_features.shape[1])
feature_map = ['lstm_{feature}'.format(feature=feature)
               for feature in features]
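
# Note: create_padded_seqs is defined earlier in this script. A minimal
# sketch of what it is assumed to do: map each question to vocabulary
# indices from the fitted CountVectorizer (out-of-vocabulary tokens become
# other_index) and left-pad/truncate to a fixed length. MAX_LEN and the +1
# shift reserving 0 for padding are assumptions, not from the original.
import numpy as np

MAX_LEN = 10


def create_padded_seqs(texts, max_len=MAX_LEN):
    vocab = counts_vectorizer.vocabulary_        # fitted above
    analyzer = counts_vectorizer.build_analyzer()
    seqs = [[vocab.get(tok, other_index) + 1 for tok in analyzer(str(t))]
            for t in texts]
    out = np.zeros((len(seqs), max_len), dtype=int)
    for i, s in enumerate(seqs):
        s = s[-max_len:]                # keep the last max_len tokens
        out[i, max_len - len(s):] = s   # left-pad with zeros
    return out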
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn.target.values
    n_trn = trn.shape[0]

    logging.info('splitting customer_ids into first 5 and next 3 digits')
    trn['cid_5'] = trn.customer_id // 1e7
    tst['cid_5'] = tst.customer_id // 1e7
    trn['cid_3'] = (trn.customer_id // 1e4) % 1e3
    tst['cid_3'] = (tst.customer_id // 1e4) % 1e3

    logging.info('drop unused columns')
    trn.drop(COLS_TO_DROP, axis=1, inplace=True)
    tst.drop(['id'] + COLS_TO_DROP, axis=1, inplace=True)

    cat_cols = (['cid_5', 'cid_3'] +
                [x for x in trn.columns if trn[x].dtype == object])
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [x for x in trn.columns
                if (trn[x].dtype == np.int64)
                and (x not in ['cid_5', 'cid_3'])]
    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    # .ix is removed in recent pandas; assign the scaled blocks directly
    trn[float_cols] = scaler.fit_transform(trn[float_cols].values)
    tst[float_cols] = scaler.transform(tst[float_cols].values)

    logging.info('adding interactions with f_5')
    interaction_cols = ['f_8', 'f_12', 'f_18', 'f_11']
    feature_cols = []
    for col in interaction_cols:
        trn['f_5+{}'.format(col)] = trn.f_5 * 10 + trn[col]
        tst['f_5+{}'.format(col)] = tst.f_5 * 10 + tst[col]
        feature_cols.append('f_5+{}'.format(col))

    for col1, col2 in combinations(interaction_cols, 2):
        logging.info('adding interactions between {} and {}'.format(col1,
                                                                    col2))
        trn['{}+{}'.format(col1, col2)] = trn[col1] + trn[col2]
        tst['{}+{}'.format(col1, col2)] = tst[col1] + tst[col2]
        trn['{}-{}'.format(col1, col2)] = trn[col1] - trn[col2]
        tst['{}-{}'.format(col1, col2)] = tst[col1] - tst[col2]
        # log1p sums/differences approximate products/ratios
        trn['{}x{}'.format(col1, col2)] = (trn[col1].apply(np.log1p) +
                                           trn[col2].apply(np.log1p))
        tst['{}x{}'.format(col1, col2)] = (tst[col1].apply(np.log1p) +
                                           tst[col2].apply(np.log1p))
        trn['{}/{}'.format(col1, col2)] = (trn[col1].apply(np.log1p) -
                                           trn[col2].apply(np.log1p))
        tst['{}/{}'.format(col1, col2)] = (tst[col1].apply(np.log1p) -
                                           tst[col2].apply(np.log1p))
        feature_cols += ['{}+{}'.format(col1, col2),
                         '{}-{}'.format(col1, col2),
                         '{}x{}'.format(col1, col2),
                         '{}/{}'.format(col1, col2)]

    logging.info('saving non-CV features')
    save_data(trn[feature_cols].values.astype(float), y, train_feature_file)
    save_data(tst[feature_cols].values.astype(float), None,
              test_feature_file)

    logging.info('generate CV features')
    feature_name, feature_ext = os.path.splitext(train_feature_file)
    feature_name = os.path.splitext(feature_name)[0]

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        cv_feature_cols = []
        logging.info('mean-target encoding for categorical columns '
                     'for CV #{}'.format(i))
        cv_trn = trn[cat_cols + [TARGET]].copy()
        cv_tst = tst[cat_cols].copy()
        for col in cat_cols:
            mean_target = cv_trn.iloc[i_trn][[col, TARGET]].groupby(
                col).mean()
            mapping = mean_target.to_dict()[TARGET]
            cv_trn[col] = cv_trn[col].map(mapping)
            cv_tst[col] = cv_tst[col].map(mapping)

        cv_feature_cols += cat_cols

        logging.info('adding min, max, median of mean-target encodings '
                     'of categorical columns')
        cv_trn['min_target_encoding'] = cv_trn[cat_cols].min(axis=1)
        cv_trn['max_target_encoding'] = cv_trn[cat_cols].max(axis=1)
        cv_trn['median_target_encoding'] = cv_trn[cat_cols].median(axis=1)
        cv_tst['min_target_encoding'] = cv_tst[cat_cols].min(axis=1)
        cv_tst['max_target_encoding'] = cv_tst[cat_cols].max(axis=1)
        cv_tst['median_target_encoding'] = cv_tst[cat_cols].median(axis=1)
        cv_feature_cols += ['min_target_encoding', 'max_target_encoding',
                            'median_target_encoding']

        logging.info('saving features for CV #{}'.format(i))
        save_data(cv_trn[cv_feature_cols].values.astype(float), y,
                  '{}.trn{}{}'.format(feature_name, i, feature_ext))
        save_data(cv_tst[cv_feature_cols].values.astype(float), None,
                  '{}.tst{}{}'.format(feature_name, i, feature_ext))

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_cols + cv_feature_cols):
            f.write('{}\t{}\tq\n'.format(i, col))
                        dest='test_output_file')
    args = parser.parse_args()

    train = pd.read_csv(args.train_file).astype(str)
    test = pd.read_csv(args.test_file).astype(str)
    df = pd.concat([train, test])

    g = nx.Graph()
    g.add_nodes_from(df.question1)
    g.add_nodes_from(df.question2)
    edges = list(df[['question1', 'question2']].to_records(index=False))
    g.add_edges_from(edges)

    def get_intersection_count(row):
        return len(set(g.neighbors(row.question1)).intersection(
            set(g.neighbors(row.question2))))

    train['intersection_count'] = train.apply(
        lambda row: get_intersection_count(row), axis=1)
    test['intersection_count'] = test.apply(
        lambda row: get_intersection_count(row), axis=1)

    save_data(train[['intersection_count']].values, None,
              args.train_output_file)
    save_data(test[['intersection_count']].values, None,
              args.test_output_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    logging.info('converting the date column into datetime')
    trn['date'] = trn.date.apply(lambda x: pd.to_datetime(x,
                                                          format='%m%d%Y'))
    tst['date'] = tst.date.apply(lambda x: pd.to_datetime(x,
                                                          format='%m%d%Y'))

    logging.info('add year and month features')
    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016
    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    y = trn.target.values
    n_trn = trn.shape[0]

    logging.info('splitting customer ids into first 8 digits')
    trn.customer_id = trn.customer_id // 1e7
    tst.customer_id = tst.customer_id // 1e7

    logging.info('drop unused columns')
    trn.drop(['target', 'date', 'f_19'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19'], axis=1, inplace=True)

    cat_cols = (['customer_id'] +
                [x for x in trn.columns if trn[x].dtype == object])
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [x for x in trn.columns
                if (trn[x].dtype == np.int64) and (x != 'customer_id')]
    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('one-hot encoding categorical variables')
    ohe = OneHotEncoder(min_obs=100)
    df = pd.concat([trn, tst], axis=0)
    X_cat = ohe.fit_transform(df[int_cols + cat_cols].values)

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    X_num = scaler.fit_transform(df[float_cols].values)
    X = sparse.hstack((X_num, X_cat)).tocsr()

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(range(X.shape[1])):
            if i < X_num.shape[1]:
                f.write('{}\t{}\tq\n'.format(i, col))
            else:
                f.write('{}\t{}\ti\n'.format(i, col))

    logging.info('saving features')
    save_data(X[:n_trn], y, train_feature_file)
    save_data(X[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    # Fill null Age using the title extracted from Name
    trn['Initial'] = trn.Name.str.extract(r'([A-Za-z]+)\.')
    tst['Initial'] = tst.Name.str.extract(r'([A-Za-z]+)\.')
    rare_initials = ['Mlle', 'Mme', 'Ms', 'Dr', 'Major', 'Lady', 'Countess',
                     'Jonkheer', 'Col', 'Rev', 'Capt', 'Sir', 'Don', 'Dona']
    grouped_initials = ['Miss', 'Miss', 'Miss', 'Mr', 'Mr', 'Mrs', 'Mrs',
                        'Other', 'Other', 'Other', 'Mr', 'Mr', 'Mr', 'Mr']
    trn['Initial'].replace(rare_initials, grouped_initials, inplace=True)
    # apply the same grouping to the test set so the later Initial map
    # covers all titles
    tst['Initial'].replace(rare_initials, grouped_initials, inplace=True)

    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Mr'), 'Age'] = 33
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Mrs'), 'Age'] = 36
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Master'), 'Age'] = 5
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Miss'), 'Age'] = 22
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Other'), 'Age'] = 46
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Mr'), 'Age'] = 33
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Mrs'), 'Age'] = 36
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Master'), 'Age'] = 5
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Miss'), 'Age'] = 22
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Other'), 'Age'] = 46

    # Fill null Embarked
    trn['Embarked'].fillna('S', inplace=True)

    # add Age_band feature
    def categorize_age(x):
        if x <= 16:
            return 0
        elif x <= 32:
            return 1
        elif x <= 48:
            return 2
        elif x <= 64:
            return 3
        else:
            return 4

    trn['Age_band'] = trn.Age.apply(categorize_age)
    tst['Age_band'] = tst.Age.apply(categorize_age)

    # add Family_Size and Alone
    trn['Family_Size'] = trn['SibSp'] + trn['Parch']
    trn['Alone'] = 0
    trn.loc[trn.Family_Size == 0, 'Alone'] = 1
    tst['Family_Size'] = tst['SibSp'] + tst['Parch']
    tst['Alone'] = 0
    tst.loc[tst.Family_Size == 0, 'Alone'] = 1

    # add Fare_cat
    def categorize_fare(x):
        if x <= 7.91:
            return 0
        elif x <= 14.454:
            return 1
        elif x <= 31:
            return 2
        else:
            return 3

    trn['Fare_cat'] = trn.Fare.apply(categorize_fare)
    tst['Fare_cat'] = tst.Fare.apply(categorize_fare)

    # Change Initial, Embarked and Sex from string to numerical
    initial_map = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Other': 4}
    trn.Initial = trn.Initial.map(initial_map)
    tst.Initial = tst.Initial.map(initial_map)
    trn.Embarked = trn.Embarked.map({'C': 0, 'Q': 1, 'S': 2})
    tst.Embarked = tst.Embarked.map({'C': 0, 'Q': 1, 'S': 2})
    trn.Sex = trn.Sex.map({'female': 0, 'male': 1})
    tst.Sex = tst.Sex.map({'female': 0, 'male': 1})

    # drop redundant features
    trn.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin'], axis=1,
             inplace=True)
    tst.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin'], axis=1,
             inplace=True)

    # concat trn and tst
    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    with open(feature_map_file, 'w') as f:
        for col in df.columns:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn], y, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     target_label_file, test_feature_file,
                     feature_map_file):
    logging.info('loading raw data')
    # load the competition data and target values
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    # outlier removal: drop training rows that fall outside the
    # min-max range of the test set
    n_trn = trn.shape[0]
    for col in trn.columns[:18]:
        trn = trn.loc[np.logical_and(trn[col] >= tst[col].min(),
                                     trn[col] <= tst[col].max())]
    logging.info(f'Number of rows removed: {n_trn - trn.shape[0]}')
    n_trn = trn.shape[0]
    y = trn[TARGET_COL].values

    # combine both datasets so they are transformed together
    trn.drop(TARGET_COL, axis=1, inplace=True)
    dataset = pd.concat([trn, tst], axis=0)
    dataset.fillna(-1, inplace=True)

    # select the columns used to build the new features
    wave_columns = dataset.columns.drop(['nObserve', 'nDetect', 'redshift'])

    # create new variables from the differences between adjacent columns
    for j in range(14):
        name = ('diff_' + str(wave_columns[j + 1]) + '_' +
                str(wave_columns[j]))
        dataset[name] = (dataset[wave_columns[j + 1]] -
                         dataset[wave_columns[j]])
        logging.info(f'{wave_columns[j + 1]} - {wave_columns[j]} {j}')

    # create 15-point ranking variables over the selected columns
    mag_rank = dataset[wave_columns].rank(axis=1)
    rank_col = [col + '_rank' for col in trn[wave_columns].columns]
    mag_rank.columns = rank_col
    dataset = pd.concat([dataset, mag_rank], axis=1)

    # create variables comparing wavelength differences
    # between measurement methods
    diff_col = []
    for col in ['u', 'g', 'r', 'i', 'z']:
        for i in range(2):
            diff_col.append(col + '_' + str(i))

    mag_wave = pd.DataFrame(np.zeros((dataset.shape[0], 10)),
                            index=dataset.index)
    for i in range(0, 10, 5):
        for j in range(5):
            mag_wave.loc[:, j + i] = (dataset[wave_columns[j]] -
                                      dataset[wave_columns[5 + j + i]])
            logging.info(
                f'{wave_columns[j]} - {wave_columns[5 + j + i]} {i + j}')

    # append the new variables to the competition data
    mag_wave.columns = diff_col
    dataset = pd.concat([dataset, mag_wave], axis=1)

    # use np.log1p to bring the power-law distribution closer to normal,
    # and create a new variable from the difference of nObserve and nDetect
    dataset['nObserve'] = dataset['nObserve'].apply(np.log1p)
    dataset['d_obs_det'] = dataset['nObserve'] - dataset['nDetect']

    # columns selected for removal via permutation importance
    drop_columns = ['d_obs_det', 'g_0', 'diff_airmass_z_airmass_i', 'u',
                    'airmass_g', 'airmass_z', 'nDetect', 'dered_i_rank',
                    'diff_airmass_r_airmass_g', 'dered_r_rank',
                    'dered_g_rank', 'g_rank', 'airmass_i_rank',
                    'airmass_r_rank', 'airmass_g_rank', 'airmass_z_rank',
                    'dered_u_rank', 'r_rank', 'diff_airmass_u_dered_z',
                    'u_rank', 'z_rank', 'dered_z_rank', 'airmass_u_rank',
                    'diff_airmass_i_airmass_r', 'i_rank', 'airmass_r', 'z']

    # drop the unneeded columns
    dataset = dataset.drop(drop_columns, axis=1).copy()

    # save the engineered features
    with open(feature_map_file, 'w') as f:
        for col in dataset.columns:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(dataset.values[:n_trn, :], y, train_feature_file)
    save_data(dataset.values[n_trn:, :], None, test_feature_file)

    logging.info('saving target label')
    np.savetxt(target_label_file, y, fmt='%d', delimiter=',')