# --- tail of the LightGBM `param` dict (opening `param = {` is above this chunk) ---
'verbose': -1,
# 'seed': SEED
}

np.random.seed(SEED)

# Loader wraps the feature set of the LB 0.804 model; supplies the base train
# matrix and its categorical-column list.
loader = utils_best.Loader('LB804')

# =============================================================================
# load
# =============================================================================
# train
X_train = loader.train()
y_train = utils.read_pickles('../data/label').TARGET

# resolve feather files for the candidate new features
# NOTE(review): second arg presumably selects the train split — confirm in utils.get_use_files.
files_tr = utils.get_use_files(new_features, True)

# column-wise concat of every candidate feature file, appended onto the base matrix
X_ = pd.concat([pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
X_train = pd.concat([X_train, X_], axis=1)

# hard-fail on duplicated column names (would silently corrupt training)
if X_train.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()

# categorical columns actually present in the assembled matrix
CAT = list(set(X_train.columns) & set(loader.category()))
# --- tail of the LightGBM `param` dict (opening `param = {` is above this chunk) ---
'bagging_freq': 1,
'verbose': -1,
'seed': SEED
}

# feature-file name prefixes to load; disabled entries kept for provenance
use_files = [
    'train_f001',
#    'train_f002_WEEKDAY_APPR_PROCESS_START-ORGANIZATION_TYPE',
#    'train_f002_OCCUPATION_TYPE-ORGANIZATION_TYPE'
]

# =============================================================================
# load
# =============================================================================
files = utils.get_use_files(use_files, True)
# column-wise concat of all selected feather files into one train matrix
X = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1)
y = utils.read_pickles('../data/label').TARGET

# categorical features present in this matrix (intersection with the master list)
CAT = list(set(X.columns) & set(utils_cat.ALL))

# hard-fail on duplicated column names (would silently corrupt training)
if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

gc.collect()

# =============================================================================
# 'max_leaves': 0, 'max_bin': 256, # 'predictor': 'cpu_predictor', 'objective': 'binary:logistic', 'eval_metric': 'auc', # 'seed': SEED } use_files = [] np.random.seed(SEED) # ============================================================================= # load train, test # ============================================================================= files = utils.get_use_files(use_files, True) X_train = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1) y = utils.read_pickles('../data/label').TARGET # maxwell maxwell = pd.read_feather('../feature_someone/Maxwell_train.f') X_train = pd.concat([X_train, maxwell], axis=1) del maxwell gc.collect() if X_train.columns.duplicated().sum() > 0: raise Exception( f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }') print('no dup :) ')
# --- tail of the LightGBM `param` dict (opening `param = {` is above this chunk) ---
'bagging_freq': 1,
'verbose': -1,
'seed': SEED
}

np.random.seed(SEED)

# =============================================================================
# load
# =============================================================================
# feature-file name prefixes to load
prefixes = [
    'f001',
    'f002',
]

files = utils.get_use_files(prefixes, True)
X_train = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1)

# here the label column travels inside the feature files; split it off
# NOTE(review): `label_name` is defined above this chunk — confirm its value.
y_train = X_train[label_name]
X_train.drop(label_name, axis=1, inplace=True)

# categorical features present in this matrix (intersection with the master list)
CAT = list(set(X_train.columns) & set(utils_cat.ALL))

# hard-fail on duplicated column names (would silently corrupt training)
if X_train.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()
'nthread': cpu_count(), 'bagging_freq': 1, 'verbose': -1, 'seed': SEED } # ============================================================================= # load # ============================================================================= X = pd.read_csv( '../data/Valid_stochastic_blending_v3-2_valid_0.308rk_0.444mw1_0.248t1_0.81050CV_0.3Adv.csv' ) X.drop('SK_ID_CURR', axis=1) y = utils.read_pickles('../data/label').TARGET files = utils.get_use_files(new_features) #for new_feature in new_features: # files += glob(f'../feature/train_{new_feature}*') print('files:', len(files)) X_ = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1) X = pd.concat([X, X_], axis=1) del X_ if X.columns.duplicated().sum() > 0: raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }') print('no dup :) ') print(f'X.shape {X.shape}')
# --- tail of the LightGBM `param` dict (opening `param = {` is above this chunk) ---
'nthread': cpu_count(),
'bagging_freq': 1,
'verbose': -1,
'seed': SEED
}

# Loader wraps the feature set of the LB 0.804 model.
loader = utils_best.Loader('LB804')

# =============================================================================
# load
# =============================================================================
X_old = loader.train()
y = utils.read_pickles('../data/label').TARGET

files_tr = utils.get_use_files(new_features, True)

# column-wise concat of the candidate feature files
X_ = pd.concat([pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)

# new matrix = old features + candidates, with the f001 EXT_SOURCE columns removed
# NOTE(review): reason for dropping EXT_SOURCE_1..3 is not visible here — confirm intent.
X_new = pd.concat([X_old, X_], axis=1).drop(
    ['f001_EXT_SOURCE_1', 'f001_EXT_SOURCE_2', 'f001_EXT_SOURCE_3'], axis=1)
del X_

# hard-fail on duplicated column names (would silently corrupt training)
if X_new.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X_new.columns[X_new.columns.duplicated()] }')
print('no dup :) ')
print(f'X_new.shape {X_new.shape}')

gc.collect()

# categorical columns actually present in the assembled matrix
CAT = list(set(X_new.columns) & set(loader.category()))
@author: Kazuki """ import numpy as np import pandas as pd from tqdm import tqdm import gc, os from multiprocessing import Pool, cpu_count NTHREAD = cpu_count() #import utils_agg import utils #utils.start(__file__) #============================================================================== files = utils.get_use_files([], True) X = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1) # ============================================================================= # var0 # ============================================================================= col_var0 = utils.check_var(X) def multi_touch_var0(arg): os.system(f'touch "../feature_var0/{arg}.f"') pool = Pool(cpu_count()) pool.map(multi_touch_var0, col_var0)