def ps_car_13_x_ps_reg_03(train_df, test_df):
    """Add the interaction feature ps_car_13 * ps_reg_03 to both frames."""
    log_info('ps_car_13_x_ps_reg_03')

    for df in (train_df, test_df):
        df['made_ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']

    return train_df, test_df
def count_encoding(train_df, test_df, replace=True):
    """Frequency-encode every raw categorical ('cat', non-'made') column.

    Each category value is mapped to its relative frequency computed over
    the combined train+test data. When ``replace`` is True the original
    column is overwritten; otherwise a new ``made_count_<col>`` column is
    added.

    Bug fixed: the ``replace=False`` branch previously re-assigned the
    ``made_count_`` column once per category from the *original* column,
    so only the last category ever ended up encoded. The full mapping is
    now built once per column and applied with ``Series.map``, which also
    avoids the sequential-``replace`` hazard of a frequency colliding with
    a not-yet-replaced category value.
    """
    log_info('Count encoding')

    cols = [c for c in train_df.columns if c not in ['id', 'target']]
    train_test_df = pd.concat([train_df[cols], test_df[cols]])

    cat_cols = [c for c in cols if 'cat' in c and 'made' not in c]

    n = len(train_test_df)
    for cat_col in cat_cols:
        # Series indexed by category value -> relative frequency.
        freq = train_test_df[cat_col].value_counts() / n
        if replace:
            train_df[cat_col] = train_df[cat_col].map(freq)
            test_df[cat_col] = test_df[cat_col].map(freq)
        else:
            train_df['made_count_' + cat_col] = train_df[cat_col].map(freq)
            test_df['made_count_' + cat_col] = test_df[cat_col].map(freq)

    return train_df, test_df
def all_feature_pca(train_df, test_df, n_components=5):
    """Standardize the raw (non-'made') features, fit a PCA on combined
    train+test, and add the first ``n_components`` projections as
    ``made_all_feature_pca_<i>`` columns to both frames.

    Bug fixed: ``DataFrame.as_matrix()`` was deprecated and removed in
    pandas 1.0 and raises AttributeError there; replaced with ``.values``
    (identical semantics).
    """
    log_info('all_feature_pca')
    log_info('n_components=%d'%n_components)

    # Raw feature columns only: skip id/target and engineered columns.
    cols = [c for c in train_df.columns
            if c not in ['id', 'target'] and 'made' not in c]
    train_test_df = pd.concat([train_df[cols], test_df[cols]])

    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    pca = PCA(n_components=n_components, random_state=41)

    # Fit scaler and PCA on the combined data, then project train/test
    # through the same transforms.
    X_train_test = scaler.fit_transform(train_test_df.values)
    X_train = scaler.transform(train_df[cols].values)
    X_test = scaler.transform(test_df[cols].values)

    pca.fit(X_train_test)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    for i in range(n_components):
        train_df['made_all_feature_pca_%d'%i] = X_train[:, i]
        test_df['made_all_feature_pca_%d'%i] = X_test[:, i]

    return train_df, test_df
def over_sampling(X, y):
    """Oversample the minority class with SMOTE and return the resampled
    (X, y) pair.

    random_state is pinned to 41, matching the PCA seeds used elsewhere
    in this file.
    """
    log_info('over_sampling')

    # NOTE(review): ``fit_sample`` was renamed ``fit_resample`` in
    # imbalanced-learn >= 0.4 — confirm the installed version supports it.
    smote = SMOTE(random_state=41)
    X, y = smote.fit_sample(X, y)

    return X, y
def _recon_ps_reg_03(series):
    """Decompose a ps_reg_03 column into its (M, F) integer components.

    Follows the Kaggle kernel's reconstruction: I = round((40*x)**2),
    M = (I - 1) // 31, F = I - 31 * M.
    """
    I = np.round((40 * series) ** 2).astype(int)
    M = (I - 1) // 31
    F = I - 31 * M
    return M, F


def pascal_recon_ps_reg_03(train_df, test_df):
    '''
    Reconstruction of 'ps_reg_03'
    [https://www.kaggle.com/pnagel/reconstruction-of-ps-reg-03]

    Adds 'ps_reg_03_M' and 'ps_reg_03_F' to both frames. Rows where
    ps_reg_03 == -1 (the missing marker) get -1 for both components.
    The train/test duplication of the original was factored into
    _recon_ps_reg_03; behavior is unchanged.
    '''
    log_info('pascal_recon_ps_reg_03')

    for df in (train_df, test_df):
        M, F = _recon_ps_reg_03(df['ps_reg_03'])
        df['ps_reg_03_M'] = M
        df['ps_reg_03_F'] = F
        # -1 marks a missing ps_reg_03; its components are missing too.
        df.loc[df['ps_reg_03'] == -1, ('ps_reg_03_M', 'ps_reg_03_F')] = -1

    return train_df, test_df
def ps_ind_06_09_bin_count_encoding(train_df, test_df):
    """Count-encode the weighted sum of ps_ind_06_bin..ps_ind_09_bin.

    Each row's code is sum(ps_ind_0i_bin * i for i in 6..9); the code is
    then replaced by its relative frequency over combined train+test and
    stored as 'made_ps_ind_06_09_bin_count'.

    Bug fixed: the accumulation loop previously read 'ps_ind_06_bin' for
    every weight i in 7..9 (so the result was ps_ind_06_bin * 30 and the
    other three columns were ignored); it now reads the intended
    'ps_ind_0{i}_bin' column for each weight.
    """
    log_info('ps_ind_06_09_bin_count_encoding')

    cols = [c for c in train_df.columns
            if c not in ['id', 'target'] and 'made' not in c]
    train_test_df = pd.concat([train_df[cols], test_df[cols]])

    bin_cols = ['ps_ind_0%d_bin' % i for i in range(6, 10)]
    weights = range(6, 10)

    X_train = sum(train_df[c] * w for c, w in zip(bin_cols, weights))
    X_test = sum(test_df[c] * w for c, w in zip(bin_cols, weights))
    X_train_test = sum(train_test_df[c] * w for c, w in zip(bin_cols, weights))

    # Map each code to its relative frequency in the combined data.
    freq = X_train_test.value_counts() / len(X_train_test)
    train_df['made_ps_ind_06_09_bin_count'] = X_train.map(freq)
    test_df['made_ps_ind_06_09_bin_count'] = X_test.map(freq)

    return train_df, test_df
def sum_of_na(train_df, test_df):
    """Add 'made_sum_of_na': per-row count of -1 missing markers across
    the raw (non-'made', non-id/target) feature columns."""
    log_info('sum_of_na')

    raw_cols = [c for c in train_df.columns
                if c not in ['id', 'target'] and 'made' not in c]

    for df in (train_df, test_df):
        df['made_sum_of_na'] = (df[raw_cols] == -1).values.sum(axis=1)

    return train_df, test_df
def dummy_encoding(train_df, test_df):
    """One-hot encode every raw categorical ('cat', non-'made') column.

    Bug fixed: get_dummies was applied to train and test independently,
    so a category present in only one frame produced mismatched dummy
    columns between the two. Each categorical column is now cast to a
    categorical dtype over the union of train+test values first, which
    guarantees both frames get the identical set of dummy columns.
    """
    log_info('Dummy encoding')

    cols = [c for c in train_df.columns if c not in ['id', 'target']]
    cat_cols = [c for c in cols if 'cat' in c and 'made' not in c]

    for c in cat_cols:
        union = pd.concat([train_df[c], test_df[c]]).unique()
        dtype = pd.CategoricalDtype(categories=sorted(union))
        train_df[c] = train_df[c].astype(dtype)
        test_df[c] = test_df[c].astype(dtype)

    train_df = pd.get_dummies(train_df, columns=cat_cols)
    test_df = pd.get_dummies(test_df, columns=cat_cols)

    return train_df, test_df
def drop_cat(train_df, test_df):
    """Drop every raw categorical ('cat', non-'made') column from both
    frames and return them."""
    log_info('drop_cat')

    to_drop = [c for c in train_df.columns
               if c not in ['id', 'target'] and 'cat' in c and 'made' not in c]

    train_df = train_df.drop(to_drop, axis=1)
    test_df = test_df.drop(to_drop, axis=1)

    return train_df, test_df
def combine_continuous_features(train_df, test_df):
    """For every unordered pair of raw float64 columns, add their sum
    ('made_plus_a_b') and product ('made_times_a_b') to both frames."""
    log_info('combine_continuous_features')

    float_cols = [c for c in train_df.select_dtypes(include=['float64']).columns
                  if 'made' not in c]

    for i, a in enumerate(float_cols):
        for b in float_cols[i + 1:]:
            plus_name = 'made_plus_' + a + '_' + b
            times_name = 'made_times_' + a + '_' + b
            for df in (train_df, test_df):
                df[plus_name] = df[a] + df[b]
                df[times_name] = df[a] * df[b]

    return train_df, test_df
def fillna(train_df, test_df, all_feature=False):
    """Replace the -1 missing markers.

    Categorical ('cat') columns are filled with the mode computed over
    combined train+test. When ``all_feature`` is True, the remaining
    numeric (non-cat, non-bin) columns are additionally filled with the
    combined median. Binary columns are never filled.

    Changes vs. the original: dead commented-out code removed, and the
    mode/median — previously recomputed for both the train and the test
    assignment — is now computed once per column (same values).

    NOTE(review): the mode/median are computed over all values including
    the -1 markers themselves — behavior preserved from the original;
    consider excluding -1 before computing the statistics.
    """
    log_info('Fill NA (cat only)')

    cols = [c for c in train_df.columns if c not in ['id', 'target']]
    train_test_df = pd.concat([train_df[cols], test_df[cols]])

    cat_cols = [c for c in cols if 'cat' in c]
    num_cols = [c for c in cols if 'cat' not in c and 'bin' not in c]

    for cat_col in cat_cols:
        fill = train_test_df[cat_col].mode()[0]
        train_df[cat_col] = train_df[cat_col].replace(-1, fill)
        test_df[cat_col] = test_df[cat_col].replace(-1, fill)

    if all_feature:
        for num_col in num_cols:
            fill = train_test_df[num_col].median()
            train_df[num_col] = train_df[num_col].replace(-1, fill)
            test_df[num_col] = test_df[num_col].replace(-1, fill)

    return train_df, test_df
def higher_than_mean(train_df, test_df):
    """For every raw non-binary column, add a 0/1 indicator of whether the
    value exceeds the column mean computed over combined train+test."""
    log_info('higher_than_mean')

    raw_cols = [c for c in train_df.columns
                if c not in ['id', 'target'] and 'made' not in c]

    combined = pd.concat([train_df[raw_cols], test_df[raw_cols]])
    means = combined.mean(axis=0)

    for col in raw_cols:
        if 'bin' in col:
            continue
        new_col = 'made_higher_than_mean_' + col
        train_df[new_col] = (train_df[col] > means[col]).astype('int')
        test_df[new_col] = (test_df[col] > means[col]).astype('int')

    return train_df, test_df
def all_one_hot(train_df, test_df):
    """One-hot encode every raw column with 3 to 6 distinct values
    (counted over combined train+test), regardless of column name, into
    'made_all_one_hot_<col>_<val>' indicator columns.

    Bug fixed: the log message said 'higher_than_median' (copy-paste from
    a neighbouring function); it now reports this function's name.
    """
    log_info('all_one_hot')

    raw_cols = [c for c in train_df.columns
                if c not in ['id', 'target'] and 'made' not in c]

    train_test_df = pd.concat([train_df[raw_cols], test_df[raw_cols]])

    for col in raw_cols:
        vals = list(train_test_df[col].unique())
        # Only moderately-cardinal columns: 3..6 distinct values.
        if 2 < len(vals) < 7:
            for val in vals:
                name = 'made_all_one_hot_' + col + '_' + str(val)
                train_df[name] = (train_df[col].values == val).astype('int')
                test_df[name] = (test_df[col].values == val).astype('int')

    return train_df, test_df
def target_encoding(train_df, test_df, replace=True):
    """Target-encode every raw categorical ('cat', non-'made') column via
    the target_encoding_ helper. When replace is True the original column
    is overwritten; otherwise a 'made_target_avg_<col>' column is added."""
    log_info('Target encoding')

    feature_cols = [c for c in train_df.columns if c not in ['id', 'target']]
    cat_cols = [c for c in feature_cols if 'cat' in c and 'made' not in c]

    for col in cat_cols:
        dest = col if replace else 'made_target_avg_' + col
        train_df[dest], test_df[dest] = target_encoding_(
            train_df[col], test_df[col], train_df['target'])

    return train_df, test_df
def likelihood_encoding(train_df, test_df, fillna=False):
    """Replace each categorical ('cat') column with the mean of the train
    target for that category (likelihood encoding).

    When ``fillna`` is True, -1 missing markers are first replaced with
    the column mode before mapping.

    Bug fixed: the original body referenced an undefined name ``df`` and
    raised NameError unconditionally; it also returned a single frame,
    unlike every sibling function in this file. It now encodes both
    frames (likelihoods estimated on train only) and returns
    (train_df, test_df), matching the file's convention.

    NOTE(review): test categories unseen in train map to NaN — confirm
    this is acceptable downstream.
    """
    log_info('Likelihood encoding')

    cat_cols = [c for c in train_df.columns if 'cat' in c]

    for cat_col in cat_cols:
        # Per-category target likelihood, estimated on train only,
        # before any missing-value filling.
        likelihood = train_df.groupby(cat_col)['target'].mean()

        tr = train_df[cat_col]
        te = test_df[cat_col]
        if fillna:
            mode_val = tr.mode()[0]
            tr = tr.replace(-1, mode_val)
            te = te.replace(-1, mode_val)

        train_df[cat_col] = tr.map(likelihood)
        test_df[cat_col] = te.map(likelihood)

    return train_df, test_df
def high_corr_pca(train_df, test_df, n_features=5):
    """For the n_features most strongly correlated pairs of raw numeric
    (non-'made', non-'cat') columns, fit a 1-component PCA on the
    standardized combined train+test pair and add the projection as a
    'made_high_corr_pca_<x>_<y>' feature to both frames.
    """
    log_info('high_corr_pca')
    log_info('n_features=%d'%n_features)

    # Raw numeric feature columns: skip id/target, engineered ('made')
    # and categorical ('cat') columns.
    tmp = [c for c in train_df.columns if c not in ['id','target']]
    cols = []
    for s in tmp:
        if not 'made' in s and not 'cat' in s:
            cols.append(s)
    train_test_df = pd.concat([train_df[cols], test_df[cols]])

    # x and y hold the same column labels; they name the two axes of the
    # absolute correlation matrix z.
    x = train_df[cols].columns.values
    y = train_df[cols].columns.values
    z = np.abs(train_test_df.corr().values)
    z[np.isnan(z)] = 0  # constant columns produce NaN correlations

    # Zero the diagonal so a column is never paired with itself.
    for i in range(len(z)):
        z[i, i] = 0

    for i in range(n_features):
        # Strongest remaining correlation. z is symmetric, so argwhere
        # finds both (row, col) and (col, row); take the first hit.
        ind = np.argwhere(z == np.max(z))[0]
        col_x = x[ind[0]]
        col_y = y[ind[1]]
        corr = z[ind[0], ind[1]]

        log_info('%d,\t(%s,\t%s)\n%f'%(i+1, col_x, col_y, corr))

        # Fresh scaler/PCA per pair; seed pinned for reproducibility.
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        pca = PCA(n_components=1, random_state=41)

        # Two-column matrices: fit on combined data, project train/test.
        X_train_test = np.vstack((train_test_df[col_x], train_test_df[col_y])).T
        X_train = np.vstack((train_df[col_x], train_df[col_y])).T
        X_test = np.vstack((test_df[col_x], test_df[col_y])).T

        X_train_test = scaler.fit_transform(X_train_test)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        pca.fit(X_train_test)
        X_train = pca.transform(X_train)[:, 0]
        X_test = pca.transform(X_test)[:, 0]

        train_df['made_high_corr_pca_' + col_x + '_' + col_y] = X_train
        test_df['made_high_corr_pca_' + col_x + '_' + col_y] = X_test

        # Retire this pair (both symmetric entries) before the next pick.
        z[ind[0], ind[1]] = 0
        z[ind[1], ind[0]] = 0

    return train_df, test_df
from package.util import init_logging, log_info


# Level-2 ensemble script: rank-average the level-1 submissions below.
#model_name = 'l2_rank_avg_%s'%datetime.now().strftime('%m%d%H%M')
model_name = 'l2_rank_avg'
init_logging(os.path.join('log', '%s.log'%model_name))


# Level-1 submission files to blend (read from the 'submission' dir).
submission_paths = [
    # Kernel
    'Froza_and_Pascal.csv.gz',
    'rgf_submit.csv.gz',
    # My model
    'l1_lgb_11182109.csv.gz',
    'l1_xgb_11230441.csv.gz'
]

log_info('l1_models:')
for submission_path in submission_paths:
    log_info('- %s'%submission_path)

# One prediction column per model, aligned on the shared index column.
submissions = [pd.read_csv(os.path.join('submission', f), index_col=0) for f in submission_paths]
submissions = pd.concat(submissions, axis=1)
submissions.columns = submission_paths

# Blend: per-model ranks normalized by row count, then averaged across
# models — robust to differently-scaled prediction distributions.
submission = pd.read_csv('input/sample_submission.csv')
submission['target'] = np.array(np.mean(submissions.rank() / submissions.shape[0], axis=1))

submission.to_csv(os.path.join('submission', '%s.csv.gz'%model_name),
    index=False, compression='gzip')
def _kinetic_column_groups(df):
    """The four column groups over which the kinetic features are computed."""
    return [
        [col for col in df.columns if '_ind_' in col],
        [col for col in df.columns if '_car_' in col and col.endswith('cat')],
        [col for col in df.columns if '_calc_' in col and not col.endswith('bin')],
        [col for col in df.columns if '_calc_' in col and col.endswith('bin')],
    ]


def _kinetic_series(df, cols):
    """Apply kinetic() to every row of df[cols]; returns a 1-D np.array."""
    subset = df[cols]
    return np.array([kinetic(subset.iloc[i])
                     for i in tqdm(range(subset.shape[0]))])


def kinetic_feature(train_df, test_df):
    '''
    Kinetic And Transforms 0.482 UP the board
    [https://www.kaggle.com/alexandrudaia/kinetic-and-transforms-0-482-up-the-board]

    Adds 'made_kinetic_1'..'made_kinetic_4' to both frames, computed over
    the four column groups in _kinetic_column_groups. Results are cached
    in processed/kinetic_train.npz and processed/kinetic_test.npz and
    reloaded on subsequent runs.

    Bug fixed: the original computed kinetic_3 from the '_car_*cat'
    column subset (a copy-paste of the kinetic_2 subset) instead of the
    intended non-binary '_calc_' group — for BOTH train and test.
    NOTE(review): caches written by the buggy version still contain the
    wrong kinetic_3; delete the .npz files to recompute.
    '''
    log_info('kinetic_feature')

    for df, cache_name in ((train_df, 'kinetic_train.npz'),
                           (test_df, 'kinetic_test.npz')):
        cache_path = os.path.join('processed', cache_name)
        if not os.path.exists(cache_path):
            kinetics = [_kinetic_series(df, cols)
                        for cols in _kinetic_column_groups(df)]
            np.savez(cache_path,
                kinetic_1=kinetics[0],
                kinetic_2=kinetics[1],
                kinetic_3=kinetics[2],
                kinetic_4=kinetics[3])
        else:
            cached = np.load(cache_path)
            kinetics = [cached['kinetic_%d' % i] for i in range(1, 5)]

        for i, values in enumerate(kinetics, start=1):
            df['made_kinetic_%d' % i] = values

    return train_df, test_df