# Example #1 (votes: 0)
def get_train_test(feat_path_list, base=None, target='target'):
    """Load feature frames, join them onto *base*, and split into train/test.

    Args:
        feat_path_list: paths of serialized feature frames, loaded in parallel.
        base: DataFrame holding at least the *target* column. Required: the
            old ``base=[]`` default was a mutable-default pitfall and crashed
            on ``base.shape`` anyway.
        target: name of the label column; rows with a null target are test rows.

    Returns:
        (train, test) DataFrames, each re-indexed from 0.

    Raises:
        ValueError: if *base* is not supplied.
    """
    if base is None:
        raise ValueError('base DataFrame is required')
    feature_list = utils.parallel_load_data(path_list=feat_path_list)
    df_feat = pd.concat(feature_list, axis=1)
    df_feat = pd.concat([base, df_feat], axis=1)
    # Rows with a known target form the training set; null target marks test.
    train = df_feat[~df_feat[target].isnull()].reset_index(drop=True)
    test = df_feat[df_feat[target].isnull()].reset_index(drop=True)

    return train, test
# Example #2 (votes: 0)
def get_dataset(base, model_no):
    """Assemble train/test feature frames for one ensemble member.

    Loads the winner/tmp feature files, joins them column-wise onto *base*,
    splits rows by null target, truncates in debug mode, and imputes every
    non-ignored column.

    Args:
        base: DataFrame containing the module-level *target* column.
        model_no: index selecting one of the candidate model directories.

    Returns:
        (train, test) DataFrames.

    NOTE(review): depends on module-level globals ``target``, ``debug``,
    ``ignore_list`` and the project ``utils`` module — confirm they are
    defined before calling.
    """
    win_path = f'../features/4_winner/*.gz'
    # Candidate per-model feature directories.
    # BUG FIX: the original applied [model_no] to the list literal AND then
    # indexed the result again, which extracted a single character of a path
    # string instead of a glob pattern.
    model_path_list = [
        f'../model/LB3670_70leaves_colsam0322/*.gz',
        '../model/E2_lift_set/*.gz', '../model/E3_PCA_set/*.gz',
        '../model/E4_mix_set/*.gz', '../model/LB3669LB_70leaves/*.gz'
    ]
    # Kept for experimentation; the effective path set below does not use it
    # (the original overwrote the model_path-based list immediately).
    model_path = model_path_list[model_no]
    tmp_path_list = glob.glob(f'../features/5_tmp/*.gz') + glob.glob(
        f'../features/0_exp/*.gz')
    # Effective feature set: winner features plus tmp/exp features.
    win_path_list = glob.glob(win_path) + tmp_path_list
    #========================================================================

    feature_list = utils.parallel_load_data(path_list=win_path_list)
    df_feat = pd.concat(feature_list, axis=1)
    base = pd.concat([base, df_feat], axis=1)

    # .copy() prevents SettingWithCopyWarning in the imputation loop below.
    train = base[~base[target].isnull()].copy()
    test = base[base[target].isnull()].copy()

    if debug:
        train = train.head(10000)
        test = test.head(1000)

    # Impute missing values column by column, skipping bookkeeping columns.
    for col in train.columns:
        if col in ignore_list:
            continue
        train[col] = utils.impute_feature(df=train, col=col)
        test[col] = utils.impute_feature(df=test, col=col)

    return train, test
# Example #3 (votes: 0)
# Load the base frame (ids + target) and split the winner feature paths into
# train / test file lists by filename.
# NOTE(review): relies on module-level `win_path`, `key`, `target`, `utils`.
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)
train_path_list = []
test_path_list = []
for path in win_path_list:
    # str.count() is used here as a substring test ("path contains 'train'").
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)

# train_path_list = sorted(train_path_list)[:20]
# test_path_list  = sorted(test_path_list)[:20]

# Rows with a known target form the training base; null target rows the test base.
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
test_feature_list = utils.parallel_load_data(path_list=test_path_list)
# Concatenate feature frames column-wise, then prepend the base columns.
# Assumes all feature files share row order with base — TODO confirm.
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)
train.set_index(key, inplace=True)
test.set_index(key, inplace=True)

# Median imputation. NOTE(review): test uses its own medians, not train's —
# potential train/test skew; confirm this is intended.
train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True)

num_list = [
    col for col in train.columns
    if (str(train[col].dtype).count('int') or str(train[col].dtype).count(
        'float')) and col != target and not (col.count('amount'))
# Ensemble 3
set3 = f'../model/E3_set/*.gz'
# Ensemble 4
set4 = f'../model/E4_set/*.gz'

# Pick the feature-set glob from the CLI (argv[2]); set1/set2 are defined
# earlier in the file (outside this chunk).
set_list = [set1, set2, set3, set4]
win_path = set_list[int(sys.argv[2])]

win_path_list = glob.glob(win_path)

# Base frame restricted to id, target and first_active_month columns.
base = utils.read_df_pkl('../input/base_term*0*')[[
    key, target, 'first_active_month'
]]
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df = pd.concat(feature_list, axis=1)
# Feature rows are assumed to be ordered train-first, then test, matching
# len(base_train) — TODO confirm against the feature-file writer.
train = pd.concat([base_train, df.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)

train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

# Optionally drop outlier rows (target <= -30) from training.
if out_part == 'no_out':
    train = train[train[target] > -30]
#========================================================================

#========================================================================
# Preprocessing before normalization (fill nulls, handle inf / -inf)
for col in train.columns:
# Example #5 (votes: 0)
    def feature_matrix(self,
                       feat_key_list=None,
                       is_reduce=False,
                       feim_path='',
                       rank=50000,
                       gain=0,
                       limit=3000):
        '''
        Explain:
            Build train and test DataFrames from self.feature_path_list.
            When feature_path_list is empty, collect feature paths first
            (everything under the features directory).
        Args:
            feat_key_list: keys used to collect feature paths when the path
                list has not been built yet (default: no filtering).
            is_reduce: reduce memory usage of the resulting frames.
            feim_path: feature-importance file; when non-empty, only features
                selected by gain (if gain > 0) or by rank are kept.
            rank: keep the top-N features by importance rank.
            gain: minimum importance gain; takes precedence over rank.
            limit: cap on the number of test feature files.
        Return:
            train, test (DataFrame)
        '''
        # Avoid the mutable-default pitfall ([] is shared across calls).
        if feat_key_list is None:
            feat_key_list = []

        if len(self.feature_path_list) == 0:
            self.get_feature_path_list(feat_key_list=feat_key_list)

        self.feature_path_list += glob.glob('../features/raw_feature/*.gz')

        # Compute the selected-feature list once — it is loop-invariant
        # (the original recomputed it for every train file).
        select_list = None
        if len(feim_path):
            if gain:
                select_list = self.select_feature(feim_path, gain=gain)
            else:
                select_list = self.select_feature(feim_path, rank=rank)

        train_path_list = []
        test_path_list = []
        for path in self.feature_path_list:
            filename = utils.get_filename(path)

            if filename[:3] == 'tra':
                if select_list is not None:
                    trn_name = filename[6:]  # strip the 'train_' prefix
                    if trn_name in select_list:
                        train_path_list.append(path)
                else:
                    train_path_list.append(path)

            elif filename[:3] == 'tes':
                # BUG FIX: the original always selected test features by
                # rank even when train was selected by gain, producing
                # mismatched train/test columns. Use the same selection.
                if select_list is not None:
                    tes_name = filename[5:]  # strip the 'test_' prefix
                    if tes_name in select_list:
                        test_path_list.append(path)
                else:
                    test_path_list.append(path)

        #========================================================================
        # Validation features are always included, bypassing selection.
        valid_list = glob.glob('../features/valid_features/*.gz')

        for path in valid_list:
            filename = utils.get_filename(path)

            if filename[:3] == 'tra':
                train_path_list.append(path)

            elif filename[:3] == 'tes':
                test_path_list.append(path)
        #========================================================================

        # Cap the test files, then drop train files without a matching test
        # counterpart so both frames end up with the same feature columns.
        test_path_list = sorted(test_path_list)[:limit]
        remove_list = []
        for path in train_path_list:
            if path.replace('train_', 'test_') not in test_path_list:
                remove_list.append(path)
        train_path_list = list(set(train_path_list) - set(remove_list))

        train_list = utils.parallel_load_data(path_list=train_path_list)
        test_list = utils.parallel_load_data(path_list=test_path_list)
        train = pd.concat(train_list, axis=1)
        test = pd.concat(test_list, axis=1)

        train = pd.concat([self.base_train, train], axis=1)
        test = pd.concat([self.base_test, test], axis=1)

        if is_reduce:
            train = utils.reduce_mem_usage(train)
            test = utils.reduce_mem_usage(test)

        return train, test