Example No. 1
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y = trn.loss.values
    n_trn = trn.shape[0]

    trn.drop('loss', axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]

    logging.info('categorical: {}, numerical: {}'.format(
        len(cat_cols), len(num_cols)))

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    df.loc[:, cat_cols] = lbe.fit_transform(df[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
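The LabelEncoder(min_obs=10) used above is not scikit-learn's encoder; judging by the min_obs argument it comes from the kaggler package, which integer-encodes each column and groups categories seen fewer than min_obs times. A rough, hypothetical stand-in (the real implementation may differ, and this sketch takes a DataFrame rather than the .values array passed above) looks like this:

import pandas as pd


class MinObsLabelEncoder:
    """Sketch of a min_obs label encoder: rare categories share code 0."""

    def __init__(self, min_obs=10):
        self.min_obs = min_obs
        self.maps_ = {}

    def fit_transform(self, df):
        out = pd.DataFrame(index=df.index)
        for col in df.columns:
            counts = df[col].value_counts()
            frequent = counts[counts >= self.min_obs].index
            self.maps_[col] = {cat: i + 1 for i, cat in enumerate(frequent)}
            out[col] = df[col].map(self.maps_[col]).fillna(0).astype(int)
        return out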
Example No. 2
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    trn['time_feat'] = trn.reset_index()['time']
    tst['time_feat'] = tst.reset_index()['time']

    # Generate feature identifying batches
    # batches are 500,000 records long
    # This will be used to generate grouped by lagged features
#    batch_size = 500000
#    n_trn = trn.shape[0]
#    for i in range(1, int((n_trn / batch_size) + 1)):
#        beg = (i - 1) * batch_size
#        end = i * batch_size - 1
#        trn.loc[beg:end, 'batch'] = i
#
#    n_tst = tst.shape[0]
#    for i in range(1, int((n_tst / batch_size) + 1)):
#        beg = (i - 1) * batch_size
#        end = i * batch_size - 1
#        tst.loc[beg:end, 'batch'] = i

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values, y.values, train_feature_file)
    save_data(tst.values, None, test_feature_file)
Example No. 3
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_header_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]

    logging.info(f'categorical: {len(cat_cols)}, numerical: {len(num_cols)}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    df[cat_cols] = lbe.fit_transform(df[cat_cols])
    df[num_cols] = df[num_cols].fillna(-1)

    with open(feature_header_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
Example No. 4
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y_trn = trn['target']
    del trn['target']
    trn_tst = pd.concat([trn, tst], axis=0)
    n_trn = len(trn)
    logging.info('trn_shape:{}, tst_shape: {}, all shape: {}'.format(
        trn.shape, tst.shape, trn_tst.shape))

    logging.info('One Hot Encoding categorical variables')
    ohe = OneHotEncoder(min_obs=1)
    trn_tst_ohe = ohe.fit_transform(trn_tst)
    trn_tst_ohe = trn_tst_ohe.tocsr()
    logging.info(f'shape of all data after OHE: {trn_tst_ohe.shape}')

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn_tst_ohe[:n_trn], y_trn, train_feature_file)
    save_data(trn_tst_ohe[n_trn:], None, test_feature_file)
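save_data and load_data come from kaggler.data_io (imported explicitly in the standalone script further below). If that package is unavailable, a minimal stand-in with the same call shape could persist the features and optional target in SVMLight format, for example:

import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file


def save_data(X, y, path):
    # kaggler.data_io.save_data picks a format from the file extension;
    # this stand-in always writes an SVMLight text file.
    if y is None:
        y = np.zeros(X.shape[0])
    dump_svmlight_file(X, y, path, zero_based=True)


def load_data(path):
    X, y = load_svmlight_file(path, zero_based=True)
    return X, y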
Example No. 5
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]

    features = [x for x in trn.columns if x not in [ID_COL, TARGET_COL]]

    df = pd.concat(
        [trn.drop([TARGET_COL, ID_COL], axis=1),
         tst.drop(ID_COL, axis=1)],
        axis=0)

    logging.info('label encoding')
    lbe = LabelEncoder(min_obs=50)
    df[features] = lbe.fit_transform(df[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            f.write('{}\t{}\tint\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn], y.values, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
Example No. 6
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)
    logging.info(f'categorical: {trn.shape[1]}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn, y)
    df = te.transform(df)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y.values, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
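TargetEncoder(cv=cv) is again assumed to be the kaggler implementation of out-of-fold mean-target encoding: each training row is encoded with category means computed on the other folds, and test rows with means from the full training set. A hedged sketch of that idea (oof_target_encode is a hypothetical helper with y as a Series, not the library's code):

import pandas as pd
from sklearn.model_selection import StratifiedKFold


def oof_target_encode(trn, tst, y, cols, n_splits=5, seed=42):
    """Sketch of out-of-fold mean-target encoding."""
    trn_enc = pd.DataFrame(index=trn.index, columns=cols, dtype=float)
    tst_enc = pd.DataFrame(index=tst.index, columns=cols, dtype=float)
    prior = y.mean()
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for i_fit, i_enc in cv.split(trn, y):
        for col in cols:
            # category means computed on the fitting folds only
            means = y.iloc[i_fit].groupby(trn[col].iloc[i_fit]).mean()
            trn_enc.iloc[i_enc, trn_enc.columns.get_loc(col)] = (
                trn[col].iloc[i_enc].map(means).fillna(prior).values)
    for col in cols:
        # test rows are encoded with means from the full training set
        means = y.groupby(trn[col]).mean()
        tst_enc[col] = tst[col].map(means).fillna(prior)
    return trn_enc, tst_enc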
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, encoder_name, encoder_dim, lrate,
                     dropout, model_file, feature_map_file, n_est, n_stop,
                     batch_size):
    logging.info('loading base feature files')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    n_trn = X.shape[0]

    logging.info('combining training and test features')
    X = sparse.vstack((X, X_tst))

    autoencoder, encoder = get_model(model_name=encoder_name,
                                     input_dim=X.shape[1],
                                     encoder_dim=encoder_dim,
                                     learning_rate=lrate,
                                     dropout=dropout)
    logging.info('training an autoencoder')
    autoencoder.summary(print_fn=logging.info)

    i_trn, i_val = train_test_split(np.arange(X.shape[0]),
                                    test_size=.2,
                                    random_state=SEED,
                                    shuffle=True)

    es = EarlyStopping(monitor='val_loss', patience=n_stop)
    mcp = ModelCheckpoint(model_file,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False)
    h = autoencoder.fit_generator(
        generator(X[i_trn], X[i_trn], batch_size),
        steps_per_epoch=int(np.ceil(len(i_trn) / batch_size)),
        epochs=n_est,
        validation_data=generator(X[i_val], X[i_val], batch_size),
        validation_steps=int(np.ceil(len(i_val) / batch_size)),
        callbacks=[es, mcp])

    val_losses = h.history['val_loss']
    n_best = val_losses.index(min(val_losses)) + 1
    autoencoder.load_weights(model_file)
    logging.info('best epoch={}'.format(n_best))

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(range(encoder_dim)):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    P = encoder.predict_generator(generator(X[:n_trn], None, batch_size),
                                  steps=int(np.ceil(n_trn / batch_size)))
    save_data(sparse.csr_matrix(P), y, train_feature_file)

    P = encoder.predict_generator(generator(X[n_trn:], None, batch_size),
                                  steps=int(
                                      np.ceil(
                                          (X.shape[0] - n_trn) / batch_size)))
    save_data(sparse.csr_matrix(P), None, test_feature_file)
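The generator helper passed to fit_generator/predict_generator is defined elsewhere in the project. A plausible minimal version, assuming it densifies sparse mini-batches and loops forever as Keras generators must, is:

def generator(X, y=None, batch_size=1024):
    """Yield mini-batches of X (and y) indefinitely, densifying sparse input."""
    n = X.shape[0]
    while True:
        for start in range(0, n, batch_size):
            xb = X[start:start + batch_size]
            if hasattr(xb, 'toarray'):
                xb = xb.toarray()
            if y is None:
                yield xb
            else:
                yb = y[start:start + batch_size]
                if hasattr(yb, 'toarray'):
                    yb = yb.toarray()
                yield xb, yb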
Example No. 8
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)

    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    # log1p transform the power-law distributed features
    df['nObserve'] = df['nObserve'].apply(np.log1p)

    # add diff features
    df['d_dered_u'] = df['dered_u'] - df['u']
    df['d_dered_g'] = df['dered_g'] - df['g']
    df['d_dered_r'] = df['dered_r'] - df['r']
    df['d_dered_i'] = df['dered_i'] - df['i']
    df['d_dered_z'] = df['dered_z'] - df['z']
    df['d_dered_rg'] = df['dered_r'] - df['dered_g']
    df['d_dered_ig'] = df['dered_i'] - df['dered_g']
    df['d_dered_zg'] = df['dered_z'] - df['dered_g']
    df['d_dered_ri'] = df['dered_r'] - df['dered_i']
    df['d_dered_rz'] = df['dered_r'] - df['dered_z']
    df['d_dered_iz'] = df['dered_i'] - df['dered_z']
    df['d_obs_det'] = df['nObserve'] - df['nDetect']

    # drop redundant features
    df.drop([
        'airmass_z', 'airmass_i', 'airmass_r', 'airmass_g', 'u', 'g', 'r', 'i',
        'nDetect', 'd_dered_rg', 'd_dered_ri'
    ],
            axis=1,
            inplace=True)

    scaler = StandardScaler()
    poly = PolynomialFeatures(2)
    X = poly.fit_transform(scaler.fit_transform(df))
    feature_names = poly.get_feature_names_out(df.columns)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_names):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(X[:n_trn, ], y, train_feature_file)
    save_data(X[n_trn:, ], None, test_feature_file)
def merge_sub_features(train_file, test_file, train_sub_features,
                       test_sub_features, train_feature_file,
                       test_feature_file, lowest):

    trn_subfeat = []
    tst_subfeat = []

    for f_trn, f_tst in zip([x for x in train_sub_features.split(' ') if x],
                            [x for x in test_sub_features.split(' ') if x]):
        logging.info('Reading trn {0} tst {1}'.format(f_trn, f_tst))

        X_sub_trn, _ = load_data(f_trn)
        X_sub_tst, _ = load_data(f_tst)

        if not ssp.issparse(X_sub_trn):
            X_sub_trn = ssp.csr_matrix(X_sub_trn)
            X_sub_tst = ssp.csr_matrix(X_sub_tst)

        trn_subfeat.append(X_sub_trn)
        tst_subfeat.append(X_sub_tst)

        logging.info('Size trn {0} tst {1}'.format(X_sub_trn.shape,
                                                   X_sub_tst.shape))

    df_train = pd.read_csv(train_file)
    y_train = df_train[TARGET].values

    logging.info('Merge sub features')
    X_trn = ssp.hstack(trn_subfeat).tocsr()
    X_tst = ssp.hstack(tst_subfeat).tocsr()
    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    drop = feature_selection.DropInactive(lowest)

    drop.fit(X_trn)
    X_trn = drop.transform(X_trn)
    X_tst = drop.transform(X_tst)

    logging.info('Size trn {0} tst {1}'.format(X_trn.shape, X_tst.shape))

    logging.info('saving features')
    save_data(X_trn, y_train, train_feature_file)
    save_data(X_tst, None, test_feature_file)
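feature_selection.DropInactive(lowest) is a project-specific transformer; from its usage it presumably keeps only columns that have at least lowest non-zero entries in the training matrix. A hedged sketch of such a transformer:

import numpy as np
import scipy.sparse as ssp


class DropInactive:
    """Sketch: keep columns with at least `lowest` non-zero training entries."""

    def __init__(self, lowest=1):
        self.lowest = lowest
        self.keep_ = None

    def fit(self, X):
        X = ssp.csc_matrix(X)
        nnz_per_col = np.diff(X.indptr)  # non-zero count per column
        self.keep_ = np.where(nnz_per_col >= self.lowest)[0]
        return self

    def transform(self, X):
        return ssp.csr_matrix(X)[:, self.keep_]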
Example No. 10
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    train = pd.read_csv(train_file, index_col='id')
    test = pd.read_csv(test_file, index_col='id')

    y_trn = train['target']
    del train['target']
    N_train = len(train)
    N_test = len(test)
    train_test = pd.concat([train, test], axis=0)
    logging.info('trn_shape:{}, tst_shape: {}, all shape: {}'.format(
        train.shape, test.shape, train_test.shape))

    logging.info(
        'Create features for entity embedding: fill in missing with mode')

    features = [x for x in train.columns if x not in ["id"]]
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        fillin_val = train_test[feat].mode()[0]
        train_test[feat] = lbl_enc.fit_transform(
            train_test[feat].fillna(fillin_val).astype(str).values)

    train = train_test[:N_train].reset_index(drop=True)
    test = train_test[N_train:].reset_index(drop=True)

    assert ((test.loc[:, features].values.shape[1]) == 23)

    test_data = [
        test.loc[:, features].values[:, k]
        for k in range(test.loc[:, features].values.shape[1])
    ]

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(train.columns):
            if col != 'id':
                f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(train, y_trn, train_feature_file)
    save_data(test, None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    trn['date'] = trn.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))
    tst['date'] = tst.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))

    trn['year_2017'] = trn.date.apply(lambda x: x.year - 2016)
    tst['year_2017'] = tst.date.apply(lambda x: x.year - 2016)

    trn['month'] = trn.date.apply(lambda x: x.month)
    tst['month'] = tst.date.apply(lambda x: x.month)

    y = trn.target.values

    n_trn = trn.shape[0]

    trn.drop(['target', 'date', 'f_19'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19'], axis=1, inplace=True)

    cat_cols = (['customer_id'] +
                [x for x in trn.columns if trn[x].dtype == object])
    num_cols = [x for x in trn.columns if trn[x].dtype != object]

    logging.info('categorical: {}, numerical: {}'.format(
        len(cat_cols), len(num_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values.astype(float), y, train_feature_file)
    save_data(tst.values.astype(float), None, test_feature_file)
Example No. 12
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y = trn.loss.values
    n_trn = trn.shape[0]

    trn.drop('loss', axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]

    logging.info('categorical: {}, numerical: {}'.format(len(cat_cols),
                                                         len(num_cols)))

    df = pd.concat([trn, tst], axis=0)

    logging.info('normalizing numeric features')
    nm = Normalizer()
    df.loc[:, num_cols] = nm.fit_transform(df[num_cols].values)

    logging.info('label encoding categorical variables')
    ohe = OneHotEncoder(min_obs=10)
    X_ohe = ohe.fit_transform(df[cat_cols].values)
    ohe_cols = ['ohe{}'.format(i) for i in range(X_ohe.shape[1])]

    X = sparse.hstack((df[num_cols].values, X_ohe), format='csr')

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(num_cols + ohe_cols):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(X[:n_trn,], y, train_feature_file)
    save_data(X[n_trn:,], None, test_feature_file)
Example No. 13
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]

    features = [x for x in trn.columns if x not in [ID_COL, TARGET_COL]]

    logging.info('target encoding')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    trn[features] = te.fit_transform(trn[features], y)
    tst[features] = te.transform(tst[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            f.write('{}\t{}\tint\n'.format(i, col))

    logging.info('saving features')
    save_data(trn[features].values, y.values, train_feature_file)
    save_data(tst[features].values, None, test_feature_file)
Example No. 14
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_header_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]

    logging.info(f'categorical: {len(cat_cols)}, numerical: {len(num_cols)}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('normalizing numeric features')
    nm = Normalizer()
    df[num_cols] = nm.fit_transform(df[num_cols].values)

    logging.info('label encoding categorical variables')
    ohe = OneHotEncoder(min_obs=10)
    X_ohe = ohe.fit_transform(df[cat_cols])
    ohe_cols = [f'ohe{i}' for i in range(X_ohe.shape[1])]

    X = sparse.hstack((df[num_cols].values, X_ohe), format='csr')

    with open(feature_header_file, 'w') as f:
        for i, col in enumerate(num_cols + ohe_cols):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(X[:n_trn, ], y, train_feature_file)
    save_data(X[n_trn:, ], None, test_feature_file)
Example No. 15
from __future__ import division

import argparse
import pandas as pd
import numpy as np
import os
import sklearn.metrics as sm
from kaggler.data_io import load_data, save_data

FEATURES = ['len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
            'len_word_q1', 'len_word_q2', 'common_words',
            'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
            'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
            'fuzz_token_set_ratio', 'fuzz_token_sort_ratio',
            'wmd', 'norm_wmd',
            'cosine_distance', 'cityblock_distance', 'jaccard_distance',
            'canberra_distance', 'euclidean_distance', 'minkowski_distance',
            'braycurtis_distance',
            'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True, dest='input_file')
    parser.add_argument('--output', '-o', required=True, dest='output_file')
    args = parser.parse_args()

    df = pd.read_csv(args.input_file)

    save_data(df[FEATURES].values, None, args.output_file)
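This script only loads pre-computed columns; the code that builds them is not shown here. For a few of the simpler columns the computation is presumably along these lines (illustrative only; the fuzzy-matching, WMD and vector-distance columns require fuzzywuzzy/gensim and are omitted):

import pandas as pd


def simple_pair_features(df):
    """Illustrative versions of a few of the FEATURES columns."""
    out = pd.DataFrame(index=df.index)
    q1 = df['question1'].fillna('').astype(str)
    q2 = df['question2'].fillna('').astype(str)
    out['len_q1'] = q1.str.len()
    out['len_q2'] = q2.str.len()
    out['diff_len'] = out['len_q1'] - out['len_q2']
    out['len_word_q1'] = q1.str.split().str.len()
    out['len_word_q2'] = q2.str.split().str.len()
    out['common_words'] = [
        len(set(a.lower().split()) & set(b.lower().split()))
        for a, b in zip(q1, q2)
    ]
    return out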
Example No. 16
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)

    # Fill empty and NaNs values with NaN
    trn = trn.fillna(np.nan)
    tst = tst.fillna(np.nan)

    # Fill Null in Fare about test
    tst.Fare = tst.Fare.fillna(tst.Fare.median())

    # Apply log to Fare to reduce the skewness of the distribution
    trn.Fare = trn.Fare.map(lambda i: np.log(i) if i > 0 else 0)
    tst.Fare = tst.Fare.map(lambda i: np.log(i) if i > 0 else 0)

    # Fill Null in Embarked about train
    trn.Embarked = trn.Embarked.fillna('S')

    # convert Embarked into categorical value
    trn.Embarked = trn.Embarked.map({'S': 0, 'Q': 1, 'C': 2})
    tst.Embarked = tst.Embarked.map({'S': 0, 'Q': 1, 'C': 2})

    # convert Sex into categorical value 0 for male and 1 for female
    trn.Sex = trn.Sex.map({'male': 0, 'female': 1})
    tst.Sex = tst.Sex.map({'male': 0, 'female': 1})

    ## Fill Age with the median age of similar rows
    ## according to Pclass, Parch and SibSp

    # Index of NaN age rows about train
    train_index_nan_age = list(trn.Age[trn.Age.isnull()].index)
    for i in train_index_nan_age:
        age_med = trn.Age.median()
        age_pred = trn.Age[(trn.SibSp == trn.iloc[i].SibSp)
                           & (trn.Parch == trn.iloc[i].Parch) &
                           (trn.Pclass == trn.iloc[i].Pclass)].median()
        if not np.isnan(age_pred):
            trn.loc[i, 'Age'] = age_pred
        else:
            trn.loc[i, 'Age'] = age_med

    test_index_nan_age = list(tst.Age[tst.Age.isnull()].index)
    for i in test_index_nan_age:
        age_med = tst.Age.median()
        age_pred = tst.Age[(tst.SibSp == tst.iloc[i].SibSp)
                           & (tst.Parch == tst.iloc[i].Parch) &
                           (tst.Pclass == tst.iloc[i].Pclass)].median()
        if not np.isnan(age_pred):
            tst.loc[i, 'Age'] = age_pred
        else:
            tst.loc[i, 'Age'] = age_med

    # Get title from Name
    trn_title = [i.split(',')[1].split('.')[0].strip() for i in trn.Name]
    trn['Title'] = pd.Series(trn_title)
    tst_title = [i.split(',')[1].split('.')[0].strip() for i in tst.Name]
    tst['Title'] = pd.Series(tst_title)

    # Convert to categorical values Title
    trn.Title = trn.Title.replace([
        'Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'
    ], 'Rare')
    trn.Title = trn.Title.map({
        'Master': 0,
        'Miss': 1,
        'Ms': 1,
        'Mme': 1,
        'Mlle': 1,
        'Mrs': 1,
        'Mr': 2,
        'Rare': 3
    })
    trn.Title = trn.Title.astype(int)

    tst.Title = tst.Title.replace([
        'Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'
    ], 'Rare')
    tst.Title = tst.Title.map({
        'Master': 0,
        'Miss': 1,
        'Ms': 1,
        'Mme': 1,
        'Mlle': 1,
        'Mrs': 1,
        'Mr': 2,
        'Rare': 3
    })
    tst.Title = tst.Title.astype(int)

    # Create a family size descriptor from SibSp and Parch
    trn['Fsize'] = trn.SibSp + trn.Parch + 1
    tst['Fsize'] = tst.SibSp + tst.Parch + 1

    # Create new feature of family size
    trn['Single'] = trn.Fsize.map(lambda s: 1 if s == 1 else 0)
    trn['SmallF'] = trn.Fsize.map(lambda s: 1 if s == 2 else 0)
    trn['MedF'] = trn.Fsize.map(lambda s: 1 if 3 <= s <= 4 else 0)
    trn['LargeF'] = trn.Fsize.map(lambda s: 1 if s >= 5 else 0)

    tst['Single'] = tst.Fsize.map(lambda s: 1 if s == 1 else 0)
    tst['SmallF'] = tst.Fsize.map(lambda s: 1 if s == 2 else 0)
    tst['MedF'] = tst.Fsize.map(lambda s: 1 if 3 <= s <= 4 else 0)
    tst['LargeF'] = tst.Fsize.map(lambda s: 1 if s >= 5 else 0)

    # convert Title and Embarked into indicator values
    trn = pd.get_dummies(trn, columns=['Title'])
    trn = pd.get_dummies(trn, columns=['Embarked'], prefix='Em')
    tst = pd.get_dummies(tst, columns=['Title'])
    tst = pd.get_dummies(tst, columns=['Embarked'], prefix='Em')

    # Replace the Cabin number with the cabin's deck letter ('X' if missing)
    trn.Cabin = pd.Series(['X' if pd.isnull(i) else i[0] for i in trn.Cabin])
    tst.Cabin = pd.Series(['X' if pd.isnull(i) else i[0] for i in tst.Cabin])

    # convert to indicator values Cabin
    trn = pd.get_dummies(trn, columns=['Cabin'], prefix='Cabin')
    tst = pd.get_dummies(tst, columns=['Cabin'], prefix='Cabin')

    # Treat Ticket by extracting the ticket prefix.
    # When there is no prefix it returns X.
    trn_ticket = []
    for i in list(trn.Ticket):
        if i.isdigit():
            trn_ticket.append('X')
        else:
            trn_ticket.append(
                i.replace('.', '').replace('/', '').strip().split(' ')[0])
    trn.Ticket = trn_ticket
    trn = pd.get_dummies(trn, columns=['Ticket'], prefix='T')

    tst_ticket = []
    for i in list(tst.Ticket):
        if i.isdigit():
            tst_ticket.append('X')
        else:
            tst_ticket.append(
                i.replace('.', '').replace('/', '').strip().split(' ')[0])
    tst.Ticket = tst_ticket
    tst = pd.get_dummies(tst, columns=['Ticket'], prefix='T')

    # Create categorical values for Pclass
    trn.Pclass = trn.Pclass.astype('category')
    trn = pd.get_dummies(trn, columns=['Pclass'], prefix='Pc')

    tst.Pclass = tst.Pclass.astype('category')
    tst = pd.get_dummies(tst, columns=['Pclass'], prefix='Pc')

    # drop redundant features
    trn.drop(['Name', 'PassengerId'], axis=1, inplace=True)
    tst.drop(['Name', 'PassengerId'], axis=1, inplace=True)

    # concat trn and tst
    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
Example No. 17
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    logging.info('label encoding categorical variables')

    y = trn.loc[:, TARGET_COL]
    n_trn = trn.shape[0]
    trn = trn.drop(TARGET_COL, axis=1)
    df = pd.concat([trn, tst], axis=0)

    # build features
    features_bin = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
    features_cat = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
    features_hex = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
    features_ord = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
    features_cyc = ['day', 'month']

    logging.info("Dummy encode: bin 0 to 4")
    # convert bins 0, 1, 2 to object so that
    # get_dummies recognizes them and creates missing indicators
    bin_012 = ['bin_0', 'bin_1', 'bin_2']
    df[bin_012] = df[bin_012].astype(object)

    dummies = pd.get_dummies(df[features_bin], dummy_na=True)
    df = df.drop(features_bin, axis=1)
    df = pd.concat([df, dummies], axis=1)

    logging.info("Target encoding: nom 0 to 9 and cyclical features")
    target_enc_cols = features_ord + features_cat + features_hex + features_cyc
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn.loc[:, target_enc_cols], y)
    df.loc[:, target_enc_cols] = te.transform(df.loc[:, target_enc_cols])

#    logging.info("Label encode ordinals: ord 0 to 5")
#    map_ord_0 = None  # already a numeric column
#    map_ord_1 = {'Novice': 1, 'Contributor': 2,
#                 'Expert': 3, 'Master': 4, 'Grandmaster': 5}
#    map_ord_2 = {'Freezing': 1, 'Cold': 2, 'Warm': 3,
#                 'Hot': 4, 'Boiling Hot': 5, 'Lava Hot': 6}
#    map_ord_3 = dict(zip(df['ord_3'].value_counts().sort_index().keys(),
#                         range(1, len(df['ord_3'].value_counts()) + 1)))
#    map_ord_4 = dict(zip(df['ord_4'].value_counts().sort_index().keys(),
#                         range(1, len(df['ord_4'].value_counts()) + 1)))
#
#    temp_ord_5 = pd.DataFrame(
#        df['ord_5'].value_counts().sort_index().keys(), columns=['ord_5'])
#    temp_ord_5['First'] = temp_ord_5['ord_5'].astype(str).str[0].str.upper()
#    temp_ord_5['Second'] = temp_ord_5['ord_5'].astype(str).str[1].str.upper()
#    temp_ord_5['First'] = temp_ord_5['First'].replace(map_ord_4)
#    temp_ord_5['Second'] = temp_ord_5['Second'].replace(map_ord_4)
#    temp_ord_5['Add'] = temp_ord_5['First'] + temp_ord_5['Second']
#    temp_ord_5['Mul'] = temp_ord_5['First'] * temp_ord_5['Second']
#    map_ord_5 = dict(zip(temp_ord_5['ord_5'],
#                         temp_ord_5['Mul']))
#
#    maps = [map_ord_0, map_ord_1, map_ord_2, map_ord_3, map_ord_4, map_ord_5]
#    for i, m in zip(range(0, 6), maps):
#        if i != 0:
#            df[f'ord_{i}'] = df[f'ord_{i}'].map(m)
#        df[f'ord_{i}'] = (df[f'ord_{i}'].fillna(df[f'ord_{i}'].median()))

#    logging.info("cyclical features")
#    df[features_cyc] = df[features_cyc].astype(object)
#    dummies_cyc = pd.get_dummies(df[features_cyc], dummy_na=True)
#    df = df.drop(features_cyc, axis=1)
#    df = pd.concat([df, dummies_cyc], axis=1)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y.values, train_feature_file)

    save_data(df.values[n_trn:, ], None, test_feature_file)
    print('Generate bow')

    bow_extractor = CountVectorizer(max_df=0.999,
                                    min_df=50,
                                    max_features=1000000,
                                    analyzer='word',
                                    ngram_range=(1, 6),
                                    stop_words='english',
                                    binary=True,
                                    lowercase=True)

    corpus = []
    for f in feats:
        data_all[f] = data_all[f].astype(str)
        corpus += data_all[f].values.tolist()

    bow_extractor.fit(corpus)

    for f in feats:
        bow = bow_extractor.transform(data_all[f].values.tolist())

        train_bow = bow[:train.shape[0]]
        test_bow = bow[train.shape[0]:]

        if 'question1' in f:
            save_data(train_bow, None, args.q1_train_output_file)
            save_data(test_bow, None, args.q1_test_output_file)
        else:
            save_data(train_bow, None, args.q2_train_output_file)
            save_data(test_bow, None, args.q2_test_output_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)

    # view of trn and tst
    full_data = [trn, tst]

    # Some features of my own that I have added in
    # Gives the length of the name
    for dataset in full_data:
        dataset['Name_length'] = dataset.Name.apply(len)

    # Feature that tells whether a passenger had a cabin and Titanic
    for dataset in full_data:
        dataset['Has_Cabin'] = dataset.Cabin.apply(lambda x: 0 if type(x) == float else 1)

    # Feature engineering steps taken from Sina
    # Create new feature FamilySize as a combination of SibSp and Parch
    for dataset in full_data:
        dataset['FamilySize'] = dataset.SibSp + dataset.Parch + 1

    # Create new feature IsAlone from FamilySize
    for dataset in full_data:
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # Remove all NULLS in the Embarked column
    for dataset in full_data:
        dataset.Embarked = dataset.Embarked.fillna('S')

    # Remove all NULLS in the Fare column
    for dataset in full_data:
        dataset.Fare = dataset.Fare.fillna(dataset.Fare.median())

    # Remove all NULLS in the Age column
    for dataset in full_data:
        age_avg = dataset.Age.mean()
        age_std = dataset.Age.std()
        age_null_count = dataset.Age.isnull().sum()
        age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)

        dataset.loc[np.isnan(dataset.Age), 'Age'] = age_null_random_list
        dataset.Age = dataset.Age.astype(int)

    # Define function to extract titles from passenger names
    def get_title(name):
        title_search = re.search(r' ([A-Za-z]+)\.', name)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(1)
        return ""

    # Create a new feature Title, containing the titles of passenger names
    for dataset in full_data:
        dataset['Title'] = dataset.Name.apply(get_title)

    # Group all non-common titles into one single grouping 'Rare'
    for dataset in full_data:
        dataset.Title = dataset.Title.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                               'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer',
                                               'Dona'], 'Rare')
        dataset.Title = dataset.Title.replace('Mlle', 'Miss')
        dataset.Title = dataset.Title.replace('Ms', 'Miss')
        dataset.Title = dataset.Title.replace('Mme', 'Mrs')

    for dataset in full_data:
        # Mapping Sex
        dataset.Sex = dataset.Sex.map({'female': 0, 'male': 1}).astype(int)

        # Mapping Titles
        title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
        dataset.Title = dataset.Title.map(title_mapping)
        dataset.Title = dataset.Title.fillna(0)

        # Mapping Embarked
        dataset.Embarked = dataset.Embarked.map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

        # Mapping Fare
        def map_fare(x):
            if x <= 7.91:
                return 0
            elif x <= 14.454:
                return 1
            elif x <= 31:
                return 2
            else:
                return 3

        dataset.Fare = dataset.Fare.apply(map_fare)
        dataset.Fare = dataset.Fare.astype(int)
    
        # Mapping Age
        def map_age(x):
            if x <= 16:
                return 0
            elif x <= 32:
                return 1
            elif x <= 48:
                return 2
            elif x <= 64:
                return 3
            else:
                return 4

        dataset.Age = dataset.Age.apply(map_age)

    # drop redundant features
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    trn.drop(drop_elements, axis=1, inplace=True)
    tst.drop(drop_elements, axis=1, inplace=True)

    # concat trn and tst
    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
Example No. 20
                        dest='test_output_file')
    args = parser.parse_args()

    train = pd.read_csv(args.train_file).astype(str)
    test = pd.read_csv(args.test_file).astype(str)

    feats = []

    print('Generate intersection')
    train['question_intersection'] = train.astype(str).apply(
        lambda x: calc_set_intersection(x['question1'], x['question2']),
        axis=1)
    test['question_intersection'] = test.astype(str).apply(
        lambda x: calc_set_intersection(x['question1'], x['question2']),
        axis=1)
    feats.append('question_intersection')

    print('Generate porter intersection')
    train['question_porter_intersection'] = train.astype(str).apply(
        lambda x: calc_set_intersection(x['question1_porter'], x[
            'question2_porter']),
        axis=1)
    test['question_porter_intersection'] = test.astype(str).apply(
        lambda x: calc_set_intersection(x['question1_porter'], x[
            'question2_porter']),
        axis=1)
    feats.append('question_porter_intersection')

    save_data(train[feats].values, None, args.train_output_file)
    save_data(test[feats].values, None, args.test_output_file)
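calc_set_intersection is defined elsewhere in the repository. Judging from how it is called, a plausible (hypothetical) definition is the share of unique tokens of the first question that also occur in the second:

def calc_set_intersection(text_a, text_b):
    """Sketch: fraction of unique tokens in text_a that also appear in text_b."""
    tokens_a = set(str(text_a).lower().split())
    tokens_b = set(str(text_b).lower().split())
    if not tokens_a:
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a)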
Example No. 21
    features.append('freq_min')
    features.append('freq_max')
    features.append('freq_meam')
    features.append('freq_diff1')
    features.append('freq_diff2')

    train_comb = comb[comb['is_duplicate'] >= 0][features]
    test_comb = comb[comb['is_duplicate'] < 0][features]

    return train_comb, test_comb

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', required=True, dest='train_file')
    parser.add_argument('--test-file', required=True, dest='test_file')
    parser.add_argument('--train-output-file', required=True, dest='train_output_file')
    parser.add_argument('--test-output-file', required=True, dest='test_output_file')
    args = parser.parse_args()

    train = pd.read_csv(args.train_file).astype(str)
    test = pd.read_csv(args.test_file).astype(str)

    train_feats, test_feats = generate_frequency_features(train, test, 'question1', 'question2')
    train_feats_porter, test_feats_porter = generate_frequency_features(train, test, 'question1_porter', 'question2_porter')

    train_magic = pd.concat([train_feats, train_feats_porter], axis=1)
    test_magic = pd.concat([test_feats, test_feats_porter], axis=1)

    save_data(train_magic.values, None, args.train_output_file)
    save_data(test_magic.values, None, args.test_output_file)
Example No. 22
    q2_vec, _ = load_data(vec_files[1])

    distances = []

    # paired metrics include 'euclidean', 'cosine', 'l2', 'l1', 'cityblock', 'manhattan'
    for d in sklearn.metrics.pairwise.PAIRED_DISTANCES.keys():
        distances.append(
            sklearn.metrics.pairwise.paired_distances(q1_vec, q2_vec,
                                                      metric=d))

    return np.transpose(np.vstack(distances))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-vec-files', required=True, dest='train_files')
    parser.add_argument('--test-vec-files', required=True, dest='test_files')
    parser.add_argument('--train-output-file',
                        required=True,
                        dest='train_output_file')
    parser.add_argument('--test-output-file',
                        required=True,
                        dest='test_output_file')
    args = parser.parse_args()

    train_dis = calculate_distance(args.train_files.split(' '))
    test_dis = calculate_distance(args.test_files.split(' '))

    save_data(train_dis, None, args.train_output_file)
    save_data(test_dis, None, args.test_output_file)
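sklearn.metrics.pairwise.PAIRED_DISTANCES maps metric names such as 'euclidean', 'cosine', 'l1', 'l2', 'cityblock' and 'manhattan' to paired-distance functions, so calculate_distance returns one column per metric. A tiny self-contained check of that shape (the toy arrays are made up):

import numpy as np
from sklearn.metrics import pairwise

q1_vec = np.array([[1.0, 0.0], [0.0, 1.0]])
q2_vec = np.array([[1.0, 1.0], [0.0, 2.0]])

cols = [pairwise.paired_distances(q1_vec, q2_vec, metric=m)
        for m in pairwise.PAIRED_DISTANCES]
X = np.transpose(np.vstack(cols))
print(X.shape)  # (2 question pairs, one column per metric)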
Example No. 23
    ngram_range = (1, 2)
    min_df = 3
    print('Generate ' + str(args.question_col) + ' tfidf')
    feats = ['question1', 'question2']

    if args.question_col:
        feats = ['_'.join([x, args.question_col]) for x in feats]

    vect_orig = TfidfVectorizer(max_features=max_features,
                                ngram_range=ngram_range,
                                min_df=min_df)

    corpus = []
    for f in feats:
        data_all[f] = data_all[f].astype(str)
        corpus += data_all[f].values.tolist()

    vect_orig.fit(corpus)

    for f in feats:
        tfidfs = vect_orig.transform(data_all[f].values.tolist())
    
        train_tfidf = tfidfs[:train.shape[0]]
        test_tfidf = tfidfs[train.shape[0]:]

        if 'question1' in f:
            save_data(train_tfidf, None, args.q1_train_output_file)
            save_data(test_tfidf, None, args.q1_test_output_file)
        else:
            save_data(train_tfidf, None, args.q2_train_output_file)
            save_data(test_tfidf, None, args.q2_test_output_file)

    logging.info('combining base features for training data')
    is_sparse = False
    Xs = []
    for base_feature in args.base_train_features:
        X, y = load_data(base_feature)
        is_sparse = sparse.issparse(X) or is_sparse
        Xs.append(X)

    if is_sparse:
        X = sparse.hstack(Xs).todense()
    else:
        X = np.hstack(Xs)

    idx = np.array(X.std(axis=0) != 0).reshape(-1, )
    X = X[:, idx]
    save_data(X, y, args.train_feature_file)

    logging.info('combining base features for test data')
    Xs = []
    for base_feature in args.base_test_features:
        X, y = load_data(base_feature)
        Xs.append(X)

    if is_sparse:
        X = sparse.hstack(Xs).todense()
    else:
        X = np.hstack(Xs)

    X = X[:, idx]
    save_data(X, y, args.test_feature_file)
    print('Creating count vector')
    counts_vectorizer = CountVectorizer(max_features=10000 - 1).fit(
        itertools.chain(df_all['question1'], df_all['question2']))
    other_index = len(counts_vectorizer.vocabulary_)

    X1_train_all = create_padded_seqs(df_all[df_all['id'].notnull()]['question1'])
    y_train_all = df_all[df_all['id'].notnull()]['is_duplicate'].values
    X2_train_all = create_padded_seqs(df_all[df_all['id'].notnull()]['question2'])

    X1_test = create_padded_seqs(df_all[df_all['test_id'].notnull()]['question1'])
    X2_test = create_padded_seqs(df_all[df_all['test_id'].notnull()]['question2'])

    X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
        train_test_split(X1_train_all, X2_train_all,
                         y_train_all,
                         stratify=y_train_all,
                         test_size=0.3, random_state=1989)

    feats = []
    model = LSTMModel()
    model.fit(X1_train, X2_train, X1_val, X2_val, y_train, y_val)

    train_features = model.extractFeatures(X1_train_all, X2_train_all)
    test_features = model.extractFeatures(X1_test, X2_test)

    save_data(train_features.astype(float), None, args.train_output_file)
    save_data(test_features.astype(float), None, args.test_output_file)

    features = range(train_features.shape[1])
    feature_map=["lstm_{feature}".format(feature=feature) for feature in features]
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn.target.values

    n_trn = trn.shape[0]

    logging.info('splitting customer_ids into first 5 and next 3 digits')
    trn['cid_5'] = trn.customer_id // 1e7
    tst['cid_5'] = tst.customer_id // 1e7
    trn['cid_3'] = (trn.customer_id // 1e4) % 1e3
    tst['cid_3'] = (tst.customer_id // 1e4) % 1e3

    logging.info('drop unused columns')
    trn.drop(COLS_TO_DROP, axis=1, inplace=True)
    tst.drop(['id'] + COLS_TO_DROP, axis=1, inplace=True)

    cat_cols = ['cid_5', 'cid_3'] + [x for x in trn.columns if trn[x].dtype == object]
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [x for x in trn.columns if (trn[x].dtype == np.int64) & (x not in ['cid_5', 'cid_3'])]

    logging.info('categorical: {}, float: {}, int: {}'.format(len(cat_cols),
                                                              len(float_cols),
                                                              len(int_cols)))

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    trn.loc[:, float_cols] = scaler.fit_transform(trn[float_cols].values)
    tst.loc[:, float_cols] = scaler.transform(tst[float_cols].values)

    logging.info('adding interactions with f_5')
    interaction_cols = ['f_8', 'f_12', 'f_18', 'f_11']

    feature_cols = []
    for col in interaction_cols:
        trn['f_5+{}'.format(col)] = trn.f_5 * 10 + trn[col]
        tst['f_5+{}'.format(col)] = tst.f_5 * 10 + tst[col]
        feature_cols.append('f_5+{}'.format(col))

    for col1, col2 in combinations(interaction_cols, 2):
        logging.info('adding interactions between {} and {}'.format(col1, col2))
        trn['{}+{}'.format(col1, col2)] = trn[col1] + trn[col2]
        tst['{}+{}'.format(col1, col2)] = tst[col1] + tst[col2]

        trn['{}-{}'.format(col1, col2)] = trn[col1] - trn[col2]
        tst['{}-{}'.format(col1, col2)] = tst[col1] - tst[col2]

        trn['{}x{}'.format(col1, col2)] = trn[col1].apply(np.log1p) + trn[col2].apply(np.log1p)
        tst['{}x{}'.format(col1, col2)] = tst[col1].apply(np.log1p) + tst[col2].apply(np.log1p)

        trn['{}/{}'.format(col1, col2)] = trn[col1].apply(np.log1p) - trn[col2].apply(np.log1p)
        tst['{}/{}'.format(col1, col2)] = tst[col1].apply(np.log1p) - tst[col2].apply(np.log1p)

        feature_cols += ['{}+{}'.format(col1, col2),
                         '{}-{}'.format(col1, col2),
                         '{}x{}'.format(col1, col2),
                         '{}/{}'.format(col1, col2)]

    logging.info('saving non-CV features')
    save_data(trn[feature_cols].values.astype(float), y, train_feature_file)
    save_data(tst[feature_cols].values.astype(float), None, test_feature_file)

    logging.info('generate CV features')
    feature_name, feature_ext = os.path.splitext(train_feature_file)
    feature_name = os.path.splitext(feature_name)[0]

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    for i, (i_trn, i_val) in enumerate(cv.split(y), 1):
        cv_feature_cols = []
        logging.info('mean-target encoding for categorical columns for CV #{}'.format(i))
        cv_trn = trn[cat_cols + [TARGET]].copy()
        cv_tst = tst[cat_cols].copy()
        for col in cat_cols:
            mean_target = cv_trn.iloc[i_trn][[col, 'target']].groupby(col).mean()
            mapping = mean_target.to_dict()['target']
            cv_trn[col] = cv_trn[col].map(mapping)
            cv_tst[col] = cv_tst[col].map(mapping)

        cv_feature_cols += cat_cols

        logging.info('adding min, max, mean of mean-target encodings of categorical columns')
        cv_trn['min_target_encoding'] = cv_trn[cat_cols].min(axis=1)
        cv_trn['max_target_encoding'] = cv_trn[cat_cols].max(axis=1)
        cv_trn['median_target_encoding'] = cv_trn[cat_cols].median(axis=1)
        cv_tst['min_target_encoding'] = cv_tst[cat_cols].min(axis=1)
        cv_tst['max_target_encoding'] = cv_tst[cat_cols].max(axis=1)
        cv_tst['median_target_encoding'] = cv_tst[cat_cols].median(axis=1)

        cv_feature_cols += ['min_target_encoding', 'max_target_encoding', 'median_target_encoding']

        logging.info('saving features for CV #{}'.format(i))
        save_data(cv_trn[cv_feature_cols].values.astype(float), y, '{}.trn{}{}'.format(feature_name, i, feature_ext))
        save_data(cv_tst[cv_feature_cols].values.astype(float), None, '{}.tst{}{}'.format(feature_name, i, feature_ext))

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_cols + cv_feature_cols):
            f.write('{}\t{}\tq\n'.format(i, col))
Example No. 27
                        dest='test_output_file')
    args = parser.parse_args()

    train = pd.read_csv(args.train_file).astype(str)
    test = pd.read_csv(args.test_file).astype(str)

    df = pd.concat([train, test])

    g = nx.Graph()
    g.add_nodes_from(df.question1)
    g.add_nodes_from(df.question2)
    edges = list(df[['question1', 'question2']].to_records(index=False))
    g.add_edges_from(edges)

    def get_intersection_count(row):
        return (len(
            set(g.neighbors(row.question1)).intersection(
                set(g.neighbors(row.question2)))))

    train_ic = pd.DataFrame()
    test_ic = pd.DataFrame()

    train['intersection_count'] = train.apply(
        lambda row: get_intersection_count(row), axis=1)
    test['intersection_count'] = test.apply(
        lambda row: get_intersection_count(row), axis=1)

    save_data(train[['intersection_count']].values, None,
              args.train_output_file)
    save_data(test[['intersection_count']].values, None, args.test_output_file)
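For reference, the intersection count above is the number of neighbours that the two questions share in the combined train/test question graph. A toy run (with the same networkx import assumed above; the toy frame is made up):

import networkx as nx
import pandas as pd

toy = pd.DataFrame({'question1': ['a', 'a', 'b'],
                    'question2': ['b', 'c', 'c']})
g = nx.Graph()
g.add_nodes_from(toy.question1)
g.add_nodes_from(toy.question2)
g.add_edges_from(list(toy[['question1', 'question2']].to_records(index=False)))

# 'a' and 'b' are both connected to 'c', so they share one neighbour
print(len(set(g.neighbors('a')) & set(g.neighbors('b'))))  # 1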
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    logging.info('converting the date column into datetime')
    trn['date'] = trn.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))
    tst['date'] = tst.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))

    logging.info('add year and month features')
    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016

    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    y = trn.target.values

    n_trn = trn.shape[0]

    logging.info('splitting customer ids into first 8 digits')
    trn.customer_id = trn.customer_id // 1e7
    tst.customer_id = tst.customer_id // 1e7

    logging.info('drop unused columns')
    trn.drop(['target', 'date', 'f_19'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19'], axis=1, inplace=True)

    cat_cols = (['customer_id'] +
                [x for x in trn.columns if trn[x].dtype == object])
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [
        x for x in trn.columns
        if (trn[x].dtype == np.int64) & (x != 'customer_id')
    ]

    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('label encoding categorical variables')
    ohe = OneHotEncoder(min_obs=100)
    df = pd.concat([trn, tst], axis=0)
    X_cat = ohe.fit_transform(df[int_cols + cat_cols].values)

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    X_num = scaler.fit_transform(df[float_cols].values)

    X = sparse.hstack((X_num, X_cat)).tocsr()

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(range(X.shape[1])):
            if i < X_num.shape[1]:
                f.write('{}\t{}\tq\n'.format(i, col))
            else:
                f.write('{}\t{}\ti\n'.format(i, col))

    logging.info('saving features')
    save_data(X[:n_trn], y, train_feature_file)
    save_data(X[n_trn:], None, test_feature_file)
Example No. 29
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)

    # Fill Null in Age using title
    trn['Initial'] = trn.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
    tst['Initial'] = tst.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

    title_replacements = {
        'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss', 'Dr': 'Mr', 'Major': 'Mr',
        'Lady': 'Mrs', 'Countess': 'Mrs', 'Jonkheer': 'Other', 'Col': 'Other',
        'Rev': 'Other', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Dona': 'Mr'
    }
    trn['Initial'] = trn['Initial'].replace(title_replacements)
    tst['Initial'] = tst['Initial'].replace(title_replacements)

    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Mr'), 'Age'] = 33
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Mrs'), 'Age'] = 36
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Master'), 'Age'] = 5
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Miss'), 'Age'] = 22
    trn.loc[(trn.Age.isnull()) & (trn.Initial == 'Other'), 'Age'] = 46

    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Mr'), 'Age'] = 33
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Mrs'), 'Age'] = 36
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Master'), 'Age'] = 5
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Miss'), 'Age'] = 22
    tst.loc[(tst.Age.isnull()) & (tst.Initial == 'Other'), 'Age'] = 46

    # Fill Null in Embarked
    trn['Embarked'] = trn['Embarked'].fillna('S')

    # add Age_band feature
    def categorize_age(x):
        if x <= 16:
            return 0
        elif x > 16 and x <= 32:
            return 1
        elif x > 32 and x <= 48:
            return 2
        elif x > 48 and x <= 64:
            return 3
        else:
            return 4

    trn['Age_band'] = trn.Age.apply(categorize_age)
    tst['Age_band'] = tst.Age.apply(categorize_age)

    # add Family_Size and Alone
    trn['Family_Size'] = 0
    trn['Family_Size'] = trn['SibSp'] + trn['Parch']
    trn['Alone'] = 0
    trn.loc[trn.Family_Size == 0, 'Alone'] = 1

    tst['Family_Size'] = 0
    tst['Family_Size'] = tst['SibSp'] + tst['Parch']
    tst['Alone'] = 0
    tst.loc[tst.Family_Size == 0, 'Alone'] = 1

    # add Fare_cat
    def categorize_fare(x):
        if x <= 7.91:
            return 0
        elif x > 7.91 and x <= 14.454:
            return 1
        elif x > 14.454 and x <= 31:
            return 2
        else:
            return 3

    trn['Fare_cat'] = trn.Fare.apply(categorize_fare)
    tst['Fare_cat'] = tst.Fare.apply(categorize_fare)

    # Change Initial, Embarked and Sex (string to numerical)
    trn.Initial = trn.Initial.map({
        'Master': 0,
        'Miss': 1,
        'Mr': 2,
        'Mrs': 3,
        'Other': 4
    })
    tst.Initial = tst.Initial.map({
        'Master': 0,
        'Miss': 1,
        'Mr': 2,
        'Mrs': 3,
        'Other': 4
    })

    trn.Embarked = trn.Embarked.map({'C': 0, 'Q': 1, 'S': 2})
    tst.Embarked = tst.Embarked.map({'C': 0, 'Q': 1, 'S': 2})

    trn.Sex = trn.Sex.map({'female': 0, 'male': 1})
    tst.Sex = tst.Sex.map({'female': 0, 'male': 1})

    # drop redundant features
    trn.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)
    tst.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)

    # concat trn and tst
    df = pd.concat([trn, tst], axis=0)
    df.fillna(-1, inplace=True)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
Example No. 30
def generate_feature(train_file, test_file,
        train_feature_file, target_label_file, test_feature_file, feature_map_file):
    logging.info('loading raw data')
    
    # load the competition data and the target values
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    # remove outliers:
    # drop train rows outside the min/max range observed in test
    n_trn = trn.shape[0]
    for col in trn.columns[:18]:
        trn = trn.loc[np.logical_and(trn[col] >= tst[col].min(),
                                    trn[col] <= tst[col].max())]

    logging.info(f'Number of rows removed :{n_trn - trn.shape[0]}')
    n_trn = trn.shape[0]
    y = trn[TARGET_COL].values

    # combine the datasets and process them together
    trn.drop(TARGET_COL, axis=1, inplace=True)
    dataset = pd.concat([trn, tst], axis=0)
    dataset.fillna(-1, inplace=True)

    # select the columns used to build new features
    wave_columns = dataset.columns.drop(['nObserve', 'nDetect', 'redshift'])

    # create new variables from the differences between adjacent selected columns
    for j in range(14):
        name = 'diff_' + str(wave_columns[j+1]) + '_' + str(wave_columns[j])
        dataset[name] = dataset[wave_columns[j+1]] - dataset[wave_columns[j]]
        logging.info(f'{wave_columns[j+1]} - {wave_columns[j]} {j}')

    # create rank variables across the 15 selected columns
    mag_rank = dataset[wave_columns].rank(axis=1)

    rank_col = []
    for col in trn[wave_columns].columns:
        col = col + '_rank'
        rank_col.append(col)
    mag_rank.columns = rank_col

    dataset = pd.concat([dataset, mag_rank], axis=1)

    # using the selected columns, create variables comparing
    # wavelength differences across measurement methods
    diff_col = []
    for col in ['u','g','r','i','z']:
        for i in range(2):
            diff_col.append(col + '_' + str(i))

    mag_wave = pd.DataFrame(np.zeros((dataset.shape[0],10)), index=dataset.index)

    for i in range(0,10,5):
        for j in range(5):
            mag_wave.loc[:, j+i] = dataset[wave_columns[j]] - dataset[wave_columns[5+j+i]]
            logging.info(f'{wave_columns[j]} - {wave_columns[5+j+i]} {i+j}')

    # add the newly created variables to the competition data
    mag_wave.columns = diff_col
    dataset = pd.concat([dataset, mag_wave], axis=1)

    # apply np.log1p to bring the power-law distributed feature closer to normal,
    # and create a new variable from the difference between nObserve and nDetect
    dataset['nObserve'] = dataset['nObserve'].apply(np.log1p)
    dataset['d_obs_det'] = dataset['nObserve'] - dataset['nDetect']

    # choose the columns to drop, based on permutation importance
    drop_columns = [
        'd_obs_det', 'g_0', 'diff_airmass_z_airmass_i', 'u', 'airmass_g',
        'airmass_z', 'nDetect', 'dered_i_rank', 'diff_airmass_r_airmass_g',
        'dered_r_rank', 'dered_g_rank', 'g_rank', 'airmass_i_rank',
        'airmass_r_rank', 'airmass_g_rank', 'airmass_z_rank', 'dered_u_rank',
        'r_rank', 'diff_airmass_u_dered_z', 'u_rank', 'z_rank', 'dered_z_rank',
        'airmass_u_rank', 'diff_airmass_i_airmass_r', 'i_rank', 'airmass_r',
        'z'
    ]

    # drop the unneeded columns
    dataset = dataset.drop(drop_columns, axis=1).copy()

    # write out the generated feature names
    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(dataset.columns):
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(dataset.values[:n_trn, :], y, train_feature_file)
    save_data(dataset.values[n_trn:, :], None, test_feature_file)

    logging.info('saving target label')
    np.savetxt(target_label_file, y, fmt='%d', delimiter=',')