Example #1
def corr_check(df=None):

    ' Print the Pearson correlation matrix and stop: the embedding code below is kept but unreachable '
    corr = df.corr(method='pearson')
    print(corr)
    sys.exit()
    #  corr = corr.sort_index(axis=1)
    #  corr = corr.unstack().reset_index().rename(columns={'level_0': 'feature', 'level_1':'feature_2', 0:'corr'})

    importance = pd.read_csv(
        '../output/cv_feature1099_importances_auc_0.8072030486159842.csv')[[
            'feature', 'rank'
        ]]
    df = importance.query("rank<=200")

    #  df = corr.merge(importance, on='feature', how='inner')
    #  df = df.query("rank<=200")
    #  importance.rename(columns={'feature':'feature_2', 'rank':'rank_2'}, inplace=True)
    #  df = df.merge(importance, on='feature_2', how='inner')
    #  df['corr'] = np.abs(df['corr'])

    feature_list = df['feature'].drop_duplicates().values

    base = pd.read_csv('../data/base.csv')
    path_list = glob.glob('../features/3_winner/*.npy')

    for i in range(20):
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
        #  tmp_feature_list = df.query(f'''feature=="{feat}"''')['feature_2'].values
        seed = np.random.randint(0, 100000) + 605
        np.random.seed(seed=seed)
        emb_list = np.random.choice(feature_list, 10, replace=False)

        use_paths = []
        for elem in emb_list:
            for path in path_list:
                if path.count(elem):
                    use_paths.append(path)
                    break

        logger.info(f'SELECT PATH: {len(use_paths)}')

        data = make_feature_set(base[unique_id].to_frame(),
                                path='',
                                use_feature=use_paths).set_index(unique_id)

        ' Replace +/-inf with NaN, then fill NaN with the column median '
        for col in data.columns:
            data[col] = data[col].replace([np.inf, -np.inf], np.nan)
            data[col] = data[col].fillna(data[col].median())

        #  df_emb = UMAP(data=data, D=2)
        df_emb = t_SNE(data=data, D=2)
        df_emb = pd.DataFrame(data=df_emb, columns=['x', 'y'])
        df_emb[unique_id] = base[unique_id].values

        ' NOTE: the active embedding is t-SNE, though the output filename still says umap '
        df_emb.to_csv(f'../output/{start_time}_umap_seed{seed}.csv',
                      index=False)
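
These snippets assume module-level imports and globals defined elsewhere in the project. A minimal sketch of that context follows; the value of unique_id and the ignore_features list are guesses inferred from usage, not the original definitions:

import datetime
import glob
import re
import sys
from logging import getLogger

import numpy as np
import pandas as pd

logger = getLogger(__name__)
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
unique_id = 'SK_ID_CURR'  # assumption: the row key of base.csv
target = 'TARGET'         # consistent with usage in Example #6
ignore_features = [unique_id, target, 'is_train', 'is_test', 'valid_no_4']  # assumption

Project helpers such as make_feature_set, t_SNE, data_regulize, impute_regression and check_loop are likewise defined elsewhere; hedged sketches of some of them follow the later examples.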
Example #2
def check_feature_detail(path):

    base = pd.read_csv('../data/base.csv')
    df = make_feature_set(base[unique_id].to_frame(), path)

    for col in df.columns:
        if col in ignore_features:
            continue
        print(df[col].drop_duplicates().sort_values())
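
A minimal usage sketch (hypothetical glob; the function prints the sorted distinct values of every non-ignored column):

check_feature_detail('../features/3_winner/*.npy')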
Example #3
def make_feature_manage_table():

    app_cat_list = application_cat()
    prev_cat_list = previous_cat()
    prev_num_list = previous_num()
    prev_num_df = pd.Series(prev_num_list, name='prev_num').to_frame()

    table = pd.DataFrame([])
    for app_cat in app_cat_list:
        tmp_table = pd.DataFrame([])
        for prev_cat in prev_cat_list:
            prev_num_df['prev_cat'] = prev_cat
            if len(tmp_table) == 0:
                tmp_table = prev_num_df.copy()
            else:
                tmp_table = pd.concat([tmp_table, prev_num_df], axis=0)

        tmp_table['app_cat'] = app_cat
        if len(table) == 0:
            table = tmp_table
        else:
            table = pd.concat([table, tmp_table], axis=0)

    logger.info(f'table shape: {table.shape}')
    table['make_flg'] = 0
    base = pd.read_csv('../data/base.csv')

    ' Check the feature set '
    path = '../features/f_previous_feature/*.npy'
    path_list = glob.glob(path)
    dataset = make_feature_set(base, path)
    dataset = dataset.set_index(unique_id)

    key_list = ['prev_cat', 'app_cat', 'prev_num']
    key_dict = {}
    key_dict = check_loop(key_dict)

    for col in dataset.columns:
        for app_cat in app_cat_list:
            for prev_cat in prev_cat_list:
                for prev_num in prev_num_list:

                    if col.count(app_cat) and col.count(
                            prev_cat) and col.count(prev_num):
                        key = f'{app_cat}_{prev_cat}_{prev_num}'
                        if key_dict[key] == 1:
                            continue
                        elif key_dict[key] == 0:
                            tmp = table.query(f"app_cat=='{app_cat}'").query(
                                f"prev_cat=='{prev_cat}'").query(
                                    f"prev_num=='{prev_num}'").copy()
                            tmp['make_flg'] = 1
                            tmp_2 = table[key_list].merge(tmp,
                                                          on=key_list,
                                                          how='left')
                            tmp_2.fillna(0, inplace=True)
                            table['make_flg'] += tmp_2['make_flg'].values
                            ' Mark this key as counted so it is flagged only once '
                            key_dict[key] = 1
                            logger.info(col)
                            logger.info(table.columns)

    table.to_csv(f'../output/{start_time[:12]}prev_cat_num_table.csv',
                 index=False)
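
check_loop is not shown in the listing. A plausible sketch, inferred from the key format and the 0/1 checks above: it would seed key_dict with a 0 for every (app_cat, prev_cat, prev_num) combination so the lookups cannot raise KeyError.

def check_loop(key_dict):
    ' Hypothetical sketch, not the original helper '
    for app_cat in application_cat():
        for prev_cat in previous_cat():
            for prev_num in previous_num():
                key_dict[f'{app_cat}_{prev_cat}_{prev_num}'] = 0
    return key_dict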
Example #4
def main():

    #  data = pd.read_csv('../data/FULL_OLD_BURO_MMM.csv')
    #  path = '../features/3_winner/*.npy'
    #  path = '../features/1_third_valid/*.npy'
    path = '../features/history/*.npy'
    base = pd.read_csv('../data/base.csv')
    #  data = make_feature_set(base[unique_id].to_frame(), path)
    #  data = make_feature_set(base['is_train'].to_frame(), path)
    #  data = make_feature_set(base[[unique_id, target]], path)
    data = make_feature_set(
        base[[unique_id, target, 'is_train', 'is_test', 'valid_no_4']], path)
    logger.info(data.shape)

    #  for col in data.columns:
    #      logger.info(f'\n{col}: {len(data[col][data[col]==np.inf])}')
    ' Create a normalized version of the feature set (for NN / LR / EXT) '
    data = data_regulize(df=data,
                         na_flg=1,
                         inf_flg=1,
                         mm_flg=1,
                         float16_flg=1,
                         ignore_feature_list=ignore_features,
                         logger=logger)
    data.to_csv('../data/regular_no_app_2.csv', index=False)
    logger.info(data.shape)
    #  logger.info(data.head())

    for col in data.columns:
        logger.info(data[col].drop_duplicates().sort_values())
    data.to_csv('../data/nn_history.csv', index=False)
    ' Stop here; the exploratory code below is retained but not executed '
    sys.exit()

    ' Normalization '

    ' Replace inf '

    ' Fill NaN '

    #  logger.info(f'\n{col}: {len(data[col][data[col]==np.inf])}')

    #  check_feature_detail(path)
    #  sys.exit()

    ' Turn each dataset name and its column names into a table '
    #  make_data_columns_table()
    #  sys.exit()

    ' Verify the composition of the feature set -> count usage per column in the created table '
    #  dcols = pd.read_csv('../data/data_columns_table.csv')
    #  check_feature_elems(dcols, path)
    #  sys.exit()

    #  make_feature_manage_table()
    #  make_individual_feature_set()
    pred_1 = pd.read_csv(
        '../submit/20180825_204_submit_lgb_rate0.02_1099features_CV0.8082070133827914_LB0.806_early150_iter20000_regular_dima_params.csv'
    ).set_index(unique_id).rename(columns={target: f'{target}_cv8082'})
    pred_2 = pd.read_csv(
        '../submit/20180827_072_submit_lgb_rate0.02_1099features_CV0.80606353200866_LB_early150_iter20000_dart.csv'
    ).set_index(unique_id).rename(columns={target: f'{target}_cv8060_dart'})
    pred_3 = pd.read_csv(
        '../submit/20180825_224_submit_lgb_rate0.02_1099features_CV0.8072030486159842_LB0.808_early150_iter20000_no_regular_dima_params.csv'
    ).set_index(unique_id).rename(columns={target: f'{target}_cv8072'})
    pred = pred_1.join(pred_2).join(pred_3)
    corr_check(df=pred)
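
data_regulize is defined elsewhere in the project. A hedged sketch of what its flags suggest (inf replacement, median NaN fill, min-max scaling, float16 downcast); the real implementation may differ:

def data_regulize(df, na_flg=0, inf_flg=0, mm_flg=0, float16_flg=0,
                  ignore_feature_list=None, logger=None):
    ' Hypothetical sketch inferred from the call site in Example #4 '
    ignore_feature_list = ignore_feature_list or []
    for col in df.columns:
        if col in ignore_feature_list:
            continue
        if inf_flg:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
        if na_flg:
            df[col] = df[col].fillna(df[col].median())
        if mm_flg:
            col_min, col_max = df[col].min(), df[col].max()
            if col_max > col_min:
                df[col] = (df[col] - col_min) / (col_max - col_min)
        if float16_flg:
            df[col] = df[col].astype(np.float16)
    return df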
Example #5
def check_feature_elems(dcols, path):

    prefix_list = ['a_', 'b_', 'ccb_', 'p_', 'is_', 'pos_', 'abp_', 'ap']
    data_list = ['app', 'bureau', 'prev', 'ccb', 'pos', 'is']
    base = pd.read_csv('../data/base.csv')
    df = make_feature_set(base[unique_id].to_frame(),
                          path).set_index(unique_id)
    feature_arr = df.columns

    dcols.sort_values(by=['dname', 'length'], ascending=False, inplace=True)

    ' Keep a dict so the usage count of every column can be tracked '
    col_dict = {}
    for dname in dcols['dname'].drop_duplicates():
        tmp = {}
        for col in dcols.query(f"dname=='{dname}'")['column']:
            tmp[col] = 0
        col_dict[dname] = tmp

    for f in feature_arr:
        if f[:2] == 'a_' or f[:4] == 'abp_' or f[:3] == 'ap_':
            dcolumns = dcols.query("dname=='app'")['column'].values
            dname = 'app'
        elif f[:2] == 'b_':
            dcolumns = dcols.query("dname=='bureau'")['column'].values
            dname = 'bureau'
        elif f[:2] == 'p_':
            dcolumns = dcols.query("dname=='prev'")['column'].values
            dname = 'prev'
        elif f[:4] == 'ccb_':
            dcolumns = dcols.query("dname=='ccb'")['column'].values
            dname = 'ccb'
        elif f[:4] == 'pos_':
            dcolumns = dcols.query("dname=='pos'")['column'].values
            dname = 'pos'
        elif f[:3] == 'is_':
            dcolumns = dcols.query("dname=='is'")['column'].values
            dname = 'is'
        else:
            ' Unknown prefix: skip so the previous dcolumns/dname are not reused '
            continue

        cnt_col_list = []
        ' First check the columns of the source data '
        for col in dcolumns:
            if f.count(col):
                logger.info(f'f:{f} dname:{dname} col:{col}')
                col_dict[dname][col] += 1
                cnt_col_list.append(col)

        ' Check the columns of the remaining datasets '
        tmp_data_list = data_list.copy()
        tmp_data_list.remove(dname)
        for dname in tmp_data_list:
            for col in dcols.query(f"dname=='{dname}'")['column'].values:
                ' Do not count the same column name twice '
                if col in cnt_col_list:
                    continue
                if f.count(col):
                    logger.info(f'f:{f} dname:{dname} col:{col}')
                    col_dict[dname][col] += 1
                    cnt_col_list.append(col)

    result = pd.DataFrame(col_dict).T.stack().reset_index().rename(columns={
        'level_0': 'dname',
        'level_1': 'feature',
        0: 'cnt'
    })

    result.to_csv(f'../eda/{start_time[:11]}_feature_set_elems.csv',
                  index=False)
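
The commented-out block in Example #4 shows the intended invocation of this function (paths as in the original):

dcols = pd.read_csv('../data/data_columns_table.csv')
check_feature_elems(dcols, '../features/3_winner/*.npy')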
Example #6
def main():

    prefix = ''
    level = [unique_id]
    base = pd.read_csv('../data/base.csv')

    ' Merge the feature_set used for training '
    path = '../features/3_winner/*.npy'
    dataset = make_feature_set(base, path)
    dataset_columns = list(dataset.columns)
    #  dataset.set_index(level, inplace=True)

    #  logger.info(f'\nconcat end\ndataset shape: {dataset.shape}')

    ' List of continuous-value columns to impute '
    value_path_list = glob.glob('../features/*.csv')

    ' Impute missing values in each feature and save it as npy '
    score_list = []
    impute_list = []
    null_len_list = []

    ' Path holding already-created features (used to skip duplicates) '
    path_list = glob.glob('../features/1_first_valid/*.npy')
    extract_list = []
    for path in path_list:
        if path.count('impute'):

            ' feature_name with the _impute suffix removed '
            filename = re.search(r'/([^/.]*)\.npy', path).group(1)[:-7]
            extract_list.append(filename)

    for value_path in value_path_list:
        if value_path.count('npy'):
            value = re.search(r'/([^/.]*)\.npy', value_path).group(1)
        elif value_path.count('csv'):
            value = re.search(r'/([^/.]*)\.csv', value_path).group(1)

        ' Skip features that have already been created '
        if value in extract_list:
            logger.info(f'{value} already exists.')
            continue

        ' Add features that are not part of the dataset yet '
        if value not in dataset_columns:
            if value_path.count('npy'):
                dataset[value] = np.load(value_path)
            elif value_path.count('csv'):
                base = pd.read_csv('../data/base.csv')
                tmp = pd.read_csv(value_path)
                dataset[value] = base[unique_id].to_frame().merge(
                    tmp, on=unique_id, how='left')['TARGET']

        null_len = len(dataset[value]) - len(dataset[value].dropna())
        cv_score = impute_regression(base, level, dataset, value, prefix)

        if value not in dataset_columns:
            dataset.drop(value, axis=1, inplace=True)
        impute_list.append(value)
        score_list.append(cv_score)
        null_len_list.append(null_len)

        if len(impute_list) > 2:
            result = pd.Series(impute_list, name='feature').to_frame()
            result['r2_score'] = score_list
            result['null_len'] = null_len_list
            result.to_csv(
                f'../output/{start_time[:11]}_impute_reg_feature_score.csv',
                index=False)
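
make_feature_set is the loader every example depends on. A hedged sketch inferred from its call sites (a glob over per-feature .npy files, each row-aligned with base; use_feature overrides the glob with an explicit path list); the project's real implementation may differ:

def make_feature_set(base, path, use_feature=None):
    ' Hypothetical sketch inferred from the call sites above '
    path_list = use_feature if use_feature else glob.glob(path)
    df = base.copy()
    for p in path_list:
        feature_name = re.search(r'/([^/.]*)\.npy', p).group(1)
        df[feature_name] = np.load(p)  # assumed row-aligned with base
    return df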