예제 #1
0
def two_calicurate(df):
    '''
    CALCULATION
    Create new features by applying arithmetic operations to pairs of
    numeric columns.

    Every unordered pair of distinct numeric columns is visited once. For
    each pair the module-level flags `diff`, `div` and `pro` decide whether
    `diff_feature`, `division_feature` and/or `product_feature` is applied.

    Args:
        df: DataFrame holding the source columns and the `key` column.

    Returns:
        DataFrame restricted to `key` plus every column whose name contains
        '_div_', '_diff_' or '_pro_'.

    Relies on names defined outside this function: `ignore_list`, `key`,
    `diff`, `div`, `pro`, `get_numeric_features`, `diff_feature`,
    `division_feature`, `product_feature`.
    '''
    # Both sides of the pairing come from the same column list, so fetch it
    # once instead of calling the helper twice with identical arguments.
    feature_list = get_numeric_features(df=df, ignore=ignore_list)

    # Fix: the original initialised a misspelled `used_lsit` and then read
    # `used_list`, which raises NameError on the first pair. A set of sorted
    # tuples also makes the "already seen" check O(1).
    used_pairs = set()
    for f1 in feature_list:
        for f2 in feature_list:
            # Do not compute the same combination of features twice.
            if f1 == f2:
                continue
            pair = tuple(sorted([f1, f2]))
            if pair in used_pairs:
                continue
            used_pairs.add(pair)

            # For home-credit: skip pairs where exactly one of the two
            # column names contains 'revo'.
            if ('revo' in f1) != ('revo' in f2):
                continue

            if diff:
                df = diff_feature(df=df, first=f1, second=f2)
            if div:
                df = division_feature(df=df, first=f1, second=f2)
            if pro:
                df = product_feature(df=df, first=f1, second=f2)

    # Keep only the key and the generated columns.
    use_cols = [
        col for col in df.columns
        if '_div_' in col or '_diff_' in col or '_pro_' in col
    ]
    df = df[[key] + use_cols]
    return df
예제 #2
0
def make_num_features(df, filekey):
    '''
    Save every non-categorical column of `df` as gzip-pickled arrays.

    Steps:
        1. Create the output directory `../features/{filekey}`.
        2. Pass `df` through `interact_feature`.
        3. Split rows into train (`target` not null) and test (`target`
           null).
        4. For each column that is not reported as categorical for the train
           split, write `train_{col}`, and `test_{col}` unless the column is
           the target itself.

    Args:
        df: input DataFrame containing the `target` column.
        filekey: name of the sub-directory under `../features/`.

    Returns:
        None; output is written through `utils.to_pkl_gzip`.

    Relies on names defined outside this function: `mkdir_func`,
    `interact_feature`, `get_numeric_features`, `get_categorical_features`,
    `ignore_list`, `logger`, `fname`, `target`, `utils`.
    NOTE(review): `fname` is not defined in this function; presumably a
    module-level name, or `filekey` was intended. Confirm.
    '''
    mkdir_func(f'../features/{filekey}')

    df = interact_feature(df, filekey)

    # Slice out numeric features by category content: the code that used
    # these two lists is disabled, but the helper calls are kept so any side
    # effects they have are preserved.
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)
    cat_list = get_categorical_features(df=df, ignore_list=[])

    logger.info(f'{fname} SET SHAPE : {df.shape}')

    # Rows with a target value are train, rows without are test.
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]

    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    # Numeric feature save: everything not reported as categorical.
    numeric_cols = [
        name for name in train.columns if name not in categorical_features
    ]
    for col in numeric_cols:
        result_train = train[col].values
        result_test = test[col].values
        logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
        utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
        # The target is only saved for the train split.
        if col == target:
            continue
        utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
예제 #3
0
def one_base_agg(df, prefix):
    '''
    Aggregate each numeric column of `df` by `key` and save the results.

    For every (numeric column, method) combination, `base_aggregation` is
    run on the two-column frame `[key, column]`, the result is left-merged
    onto `base`, and each merged column whose name contains '@' is written
    with `utils.to_pkl_gzip` to `{dir}/{col}.fp`. A column is skipped when
    it is in `ignore_list`, or when `exclude_feature` rejects either its
    full values or its values on the rows where `target` is null.

    Args:
        df: source DataFrame containing `key` and the numeric columns.
        prefix: forwarded to `base_aggregation`.

    Returns:
        None; output is written to disk.

    Relies on names defined outside this function: `get_numeric_features`,
    `ignore_list`, `method_list`, `key`, `base`, `target`,
    `base_aggregation`, `exclude_feature`, `utils`, `gc`, `dir`.
    NOTE(review): `dir` is presumably a module-level output directory that
    shadows the builtin. Confirm.
    '''
    # Prepare the list of columns to aggregate.
    numeric_columns = get_numeric_features(df=df, ignore=ignore_list)

    # Serial processing: one aggregation per (column, method).
    for num in numeric_columns:
        for method in method_list:
            pair_frame = df[[key, num]]
            aggregated = base_aggregation(df=pair_frame,
                                          level=key,
                                          method=method,
                                          prefix=prefix,
                                          feature=num)
            merged = base.merge(aggregated, on=key, how='left')
            null_target_rows = merged[merged[target].isnull()]

            for col in merged.columns:
                # Only generated columns (marked with '@') that are not
                # ignored and pass both exclusion checks are saved.
                is_candidate = '@' in col and col not in ignore_list
                if (is_candidate
                        and not exclude_feature(col, merged[col].values)
                        and not exclude_feature(col, null_target_rows[col].values)):
                    file_path = f"{dir}/{col}.fp"
                    utils.to_pkl_gzip(obj=merged[col].values, path=file_path)

            # Release the per-iteration frames before the next aggregation.
            del merged, null_target_rows, aggregated
            gc.collect()
예제 #4
0
def num_cat_encoding(df, bins=0, isfill=False, origin_drop=True):
    '''
    Explain:
        Bin numeric columns into quantile categories, in place.
    Args:
        df: DataFrame to modify; columns are added to and dropped from this
            object directly.
        bins: number of quantiles passed to `pd.qcut`; also the minimum
            number of distinct values a column needs in order to be binned.
        isfill: when True, +/-inf are replaced by NaN and NaN is then filled
            with the column median before binning.
        origin_drop: when True, the source column is dropped once its binned
            column `bin{bins}_{col}` has been added.
    Return:
        None (no return statement); the result is the mutated `df`.

    Relies on names defined outside this function: `get_numeric_features`,
    `ignore_features`, `logger`, `np`, `pd`.
    NOTE(review): with the default `bins=0` the distinct-count guard never
    skips a column, so `pd.qcut` is called with `q=0`; presumably callers
    always pass a positive `bins`. Confirm.
    '''
    target_columns = get_numeric_features(df=df, ignore=ignore_features)

    logger.info(df.shape)
    for col in target_columns:
        # If requested, fill nulls (and infinities) with the median.
        if isfill:
            without_inf = df[col].replace(np.inf, np.nan).replace(-np.inf, np.nan)
            df[col] = without_inf
            df[col] = without_inf.fillna(without_inf.median())

        # Only bin columns with at least `bins` distinct values.
        distinct_count = df[col].drop_duplicates().shape[0]
        if distinct_count >= bins:
            df[f'bin{bins}_{col}'] = pd.qcut(x=df[col], q=bins, duplicates='drop')
            if origin_drop:
                df.drop(col, axis=1, inplace=True)
예제 #5
0
def main():
    '''
    Run one feature-generation mode selected by the module-level `agg_code`.

    Modes:
        'base'       - aggregate each numeric column by `key`, print the
                       generated column names, then exit the process.
        'caliculate' - arithmetic features between column pairs, pickled
                       column by column.
        'cnt'        - count encoding of categorical columns.
        'category'   - aggregation per value of each categorical column.
        'combi'      - build combinations of `categorical`.
        'dummie'     - one-hot encode the categorical columns.

    Returns:
        None. The 'base' mode terminates the process through `sys.exit()`.

    Every input (`df`, `base`, `key`, `prefix`, `dir`, `method_list`,
    `ignore_list`, the `diff`/`div`/`pro` flags and the helper functions) is
    read from names defined outside this function.

    Fixes relative to the previous version:
        * `inf` -> `if` in the 'cnt' comprehension (SyntaxError).
        * `list.append` was called with several positional arguments
          (TypeError); each argument set is now appended as one list.
        * `arg_list` was used in the 'base' branch without being created.
        * `used_lsit` typo: `used_list` was read but never defined.
        * `df` was assigned inside some branches, which made it a local name
          for the whole function and turned every read of the outer `df`
          into UnboundLocalError. Branches now bind their results to
          separate local names and only read `df`.
    '''
    if agg_code == 'base':
        # BASE AGGREGATION: basic aggregation of single columns at the
        # granularity given by the level.

        # Prepare the list of columns to aggregate.
        num_list = get_numeric_features(df=df, ignore=ignore_list)

        # Build one argument set per (column, method) and aggregate.
        arg_list = []
        for num in num_list:
            for method in method_list:
                arg_list.append([df, key, num, method, prefix, '', base])
        call_list = pararell_process(pararell_wrapper(base_aggregation), arg_list)
        result = pd.concat(call_list, axis=1)

        for col in result.columns:
            if not(col.count('@')) or col in ignore_list:
                continue
            print(col)
        # Saving is currently disabled; this branch only lists the columns
        # and then stops the process.
        sys.exit()

    elif agg_code == 'caliculate':
        # CALCULATION: create new features by applying arithmetic operations
        # to multiple columns.
        # NOTE(review): both feature lists are empty, so the pair loop below
        # never runs and the input columns are pickled unchanged.
        f1_list = []
        f2_list = []
        used_list = []
        calc_df = df
        for f1 in f1_list:
            for f2 in f2_list:
                # Do not compute the same combination of features twice.
                if f1 == f2:
                    continue
                if sorted([f1, f2]) in used_list:
                    continue
                used_list.append(sorted([f1, f2]))

                if diff:
                    calc_df = diff_feature(df=calc_df, first=f1, second=f2)
                elif div:
                    calc_df = division_feature(df=calc_df, first=f1, second=f2)
                elif pro:
                    calc_df = product_feature(df=calc_df, first=f1, second=f2)

        for col in calc_df.columns:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=calc_df[col].values)

    elif agg_code == 'cnt':
        # COUNT ENCODING: aggregate at the level granularity and count
        # cnt_val, duplicates included.
        cat_list = get_categorical_features(df=df, ignore=ignore_list)

        cnt_df = df
        for category_col in cat_list:
            cnt_df = cnt_encoding(cnt_df, category_col, ignore_list)
        cnt_df = base.merge(cnt_df, on=key, how='inner')
        cnt_cols = [col for col in cnt_df.columns if col.count('cntec')]
        for col in cnt_cols:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=cnt_df[col].values)

    elif agg_code == 'category':
        arg_list = []
        # Categorical columns and the numeric columns to aggregate.
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)

        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    arg_list.append([base, df, key, cat, value,
                                     method, ignore_list, prefix])

        # Encode per value of each categorical column in the dataset.
        pararell_process(pararell_wrapper(select_category_value_agg), arg_list)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':
        # One-hot encode the dataset's categorical columns (the original
        # note says the mean is then taken; that step is not implemented
        # here).
        # NOTE(review): `data` and `ignore_features` are not used by any
        # other branch; presumably `df` / `ignore_list` were meant. Confirm.
        cat_list = get_categorical_features(data, ignore_features)
        dummy_df = get_dummies(df=df, cat_list=cat_list)
예제 #6
0
    f1 = feat_combi[0]
    f2 = feat_combi[1]
    feat1 = diff_feature(df=df, first=f1, second=f2)
    feat2 = division_feature(df=df, first=f1, second=f2)
    feat3 = product_feature(df=df, first=f1, second=f2)
    feat = pd.concat([feat1, feat2, feat3], axis=1)
    return feat


if arithmetic:
    # Script-level setup for pairwise arithmetic features; it runs only when
    # the `arithmetic` flag (defined outside this block) is truthy.
    used_list = []
    # The bare string below is a no-op banner. In English:
    # "CALCULATION - create new features by applying the four arithmetic
    # operations to multiple columns."
    '''
    CALICULATION
    複数カラムを四則演算し新たな特徴を作成する
    '''
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)
    # Numeric columns whose name contains 'AMT_'.
    amt_list = [col for col in num_list if col.count('AMT_')]
    # Numeric columns whose name contains 'DAYS_' or 'OWN'.
    days_list = [
        col for col in num_list if col.count('DAYS_') or col.count('OWN')
    ]
    # From here on num_list is only the AMT_ columns followed by the
    # DAYS_/OWN columns.
    num_list = amt_list + days_list

    # Columns of df whose name contains 'EXT_'; only referenced by the
    # commented-out alternatives below.
    ext_list = [col for col in df.columns if col.count('EXT_')]

    # Both sides of the pairing use the same list object.
    f1_list = num_list
    f2_list = num_list
    #  f1_list = ext_list
    #  f2_list = ext_list
    #  f2_list = ['EXT_SOURCE_2']
    # NOTE(review): `used_lsit` looks like a typo of `used_list`, which is
    # already initialised at the top of this block; nothing visible here
    # reads `used_lsit`.
    used_lsit = []
    # Empty frame, presumably the accumulator for the generated features
    # (the code that fills it is not visible here).
    result_feat = pd.DataFrame()
예제 #7
0
def main():
    '''
    Load the input frame and run the mode selected by `agg_code`.

    Command-line arguments:
        sys.argv[1]: file-name stem; everything matching
                     `../input/{stem}*` is read with
                     `utils.read_df_pickle`.
        sys.argv[2]: prefix handed to the aggregation helpers.

    Modes:
        'base'       - `one_base_agg` on the loaded frame.
        'caliculate' - `two_calicurate`, then either `one_base_agg` or, for
                       prefix 'app_', pickling each column directly.
        'cnt'        - count encoding of the categorical columns.
        'category'   - aggregation per value of each categorical column.
        'combi'      - build combinations of `categorical`.
        'dummie'     - one-hot encode the categorical columns.

    Returns:
        None.

    Relies on names defined outside this function: `agg_code`, `dir`,
    `base`, `key`, `ignore_list`, `method_list`, `categorical`, `data`,
    `ignore_features` and the helper functions it calls.
    '''
    path = f'../input/{sys.argv[1]}*'
    df = utils.read_df_pickle(path=path)
    prefix = sys.argv[2]
    mode = agg_code

    # BASE AGGREGATION: basic aggregation of single columns at the
    # granularity given by the level.
    if mode == 'base':
        one_base_agg(df=df, prefix=prefix)
        return

    if mode == 'caliculate':
        df = two_calicurate(df=df)
        if prefix == 'app_':
            # 'app_' features are saved as they are, one file per column.
            for col in df.columns:
                utils.to_pickle(path=f"{dir}/{prefix}{col}.fp",
                                obj=df[col].values)
        else:
            one_base_agg(df=df, prefix=prefix)
        return

    if mode == 'cnt':
        # COUNT ENCODING: aggregate at the level granularity and count
        # cnt_val, duplicates included.
        for category_col in get_categorical_features(df=df, ignore=ignore_list):
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        for col in df.columns:
            if not col.count('cntec'):
                continue
            if not exclude_feature(col, df[col].values):
                utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)
        return

    if mode == 'category':
        # Aggregate per value inside each categorical column.
        categorical_columns = get_categorical_features(df=df, ignore=ignore_list)
        numeric_columns = get_numeric_features(df=df, ignore=ignore_list)
        for cat in categorical_columns:
            for value in numeric_columns:
                for method in method_list:
                    select_category_value_agg(base,
                                              df=df,
                                              key=key,
                                              category_col=cat,
                                              value=value,
                                              method=method,
                                              ignore_list=ignore_list,
                                              prefix=prefix)
        return

    if mode == 'combi':
        combi_options = [1, 2, 3]
        combi_num = combi_options[0]
        cat_combi = list(combinations(categorical, combi_num))
        return

    if mode == 'dummie':
        # One-hot encode the dataset's categorical columns (the original
        # note says the mean is then taken; that step is not implemented
        # here).
        # NOTE(review): this branch reads `data` and `ignore_features`
        # rather than `df` and `ignore_list` like the others. Confirm.
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)