class Tabnet:
    """Inference wrapper around a pretrained TabNetRegressor.

    Loads pickled categorical metadata (dims, indices) and the feature list
    from ``grad_web/function/``, builds the regressor with matching
    categorical settings, and restores the trained weights from
    ``tabnet_rm_c2.zip``.
    """

    def __init__(self):
        # ``with`` closes each file on exit; the original also called
        # fp.close() redundantly after every block — removed.
        with open("grad_web/function/cat_dims.pkl", "rb") as fp:
            self.cat_dims = pickle.load(fp)
        with open("grad_web/function/cat_idxs.pkl", "rb") as fp:
            self.cat_idxs = pickle.load(fp)
        with open("grad_web/function/features.pkl", "rb") as fp:
            self.features = pickle.load(fp)
        self.model = TabNetRegressor(n_a=64, n_d=64,
                                     cat_dims=self.cat_dims,
                                     cat_idxs=self.cat_idxs)
        self.model.load_model("grad_web/function/tabnet_rm_c2.zip")

    def predict(self, data):
        """Run the underlying model on ``data``.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Raw feature matrix, or a frame from which ``self.features``
            columns are selected.

        Returns
        -------
        Model predictions, or ``None`` for unsupported input types
        (preserves the original fall-through behavior).
        """
        # BUG FIX: the original branch called ``self.predict(data)`` for
        # ndarray input, recursing forever; delegate to the model instead.
        # ``isinstance`` replaces ``type(x) ==`` per Python idiom.
        if isinstance(data, np.ndarray):
            return self.model.predict(data)
        elif isinstance(data, pd.DataFrame):
            return self.model.predict(data[self.features].values)
class TabNetBase(AI_Base):
    """Scenario-driven trainer for a TabNetRegressor over match-loader data.

    The ``act_*`` methods are scenario steps; each takes ``is_test`` and,
    when it is exactly ``True``, only records its locals via ``_TPI`` and
    does no real work.
    """

    # file_path = os.getcwd() + "\\src\\AIs\\models\\TabNetv1\\"
    file_path = os.getcwd() + "\\"
    save_name = file_path + "test_model"

    def __init__(self, *args, **kwargs):
        _TPI(self, locals())
        super(TabNetBase, self).__init__(*args, **kwargs)
        ACT = self.env.act
        MATCH = self.env.match_loader

        # Splits are populated later by act_register_data.
        self.X_train = self.X_valid = self.X_test = None
        self.y_train = self.y_valid = self.y_test = None
        self.cat_idxs, self.cat_dims, self.cat_emb_dim = MATCH.get_categorical()
        self.ai = None
        self._scenario_tactics = None
        self._scenario_matches = None
        # Nested [repeat-count, steps] scenario description.
        self._scenario_learn_from_file = list([[
            1,  # [self.epochs,
            [1,  # [len(MATCH),
             [1,
              (self.act_register_data, dict(data=MATCH.act_get(is_flat=True))),
              self.act_modify_data,
              self.act_init_ai,
              # self.act_load_game,
              self.act_run_ai_with_learn,
              # self.act_test
              ]
             ],
        ]])
        self.set_mode(self.mode)

    def act_register_data(self, data, is_test=False):
        """Copy the loader's pre-split train/valid/test arrays onto self."""
        if is_test is True:
            _TPI(self, locals())
            return
        loader = self.env.match_loader
        self.X_train = np.array(loader.train_players)
        self.y_train = np.array(loader.train_plus)
        self.X_valid = np.array(loader.valid_players)
        self.y_valid = np.array(loader.valid_plus)
        self.X_test = np.array(loader.test_players)
        self.y_test = np.array(loader.test_plus)

    def act_init_ai(self, is_test=False):
        """Build the TabNetRegressor sized to the loader's flat input."""
        if is_test is True:
            _TPI(self, locals())
            return
        loader = self.env.match_loader
        self.ai = TabNetRegressor(
            n_steps=10,
            input_dim=loader.count_cols * loader.count_players,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            cat_idxs=self.cat_idxs)

    def act_modify_data(self, is_test=False):
        """Placeholder scenario step; intentionally does nothing."""
        if is_test is True:
            _TPI(self, locals())

    def act_load_game(self, is_test=False):
        """Restore saved weights when a snapshot archive exists."""
        if is_test is True:
            _TPI(self, locals())
            return
        archive = self.save_name + ".zip"
        if os.path.isfile(archive):
            print("Load Network")
            self.ai.load_model(archive)

    def act_test(self, is_test=False):
        """Score held-out data with MSE and print the result."""
        if is_test is True:
            _TPI(self, locals())
            return
        predictions = self.ai.predict(self.X_test)
        y_true = self.y_test
        test_score = mean_squared_error(y_pred=predictions, y_true=y_true)
        # np.savetxt("predict.txt", predictions, delimiter=',', fmt='%d')
        # np.savetxt("true.txt", y_true, delimiter=',', fmt='%d')
        print(test_score)

    def act_run_ai_with_learn(self, is_test=False):
        """Fit the model on the registered train/valid splits."""
        if is_test is True:
            _TPI(self, locals())
            return
        self.ai.fit(
            X_train=self.X_train, y_train=self.y_train,
            X_valid=self.X_valid, y_valid=self.y_valid,
            max_epochs=self.epochs, patience=500,
            batch_size=512, drop_last=False)
        # self.ai.save_model(self.save_name)

    def act_save_model(self, is_test=False):
        """Persist current weights under ``save_name``."""
        if is_test is True:
            _TPI(self, locals())
            return
        print(self.save_name)
        self.ai.save_model(self.save_name)
# Inference over every (seed, fold) checkpoint, then assemble the submission.
# NOTE(review): the statements after the loops are placed at top level based on
# the "# In[115]:" notebook-cell marker — confirm nesting against the notebook.
for s in SEED:
    tabnet_params['seed'] = s
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, target)):
        # Re-pack the pretrained fold artifacts into the .zip layout that
        # TabNet's load_model expects.
        with zipfile.ZipFile(f'TabNet_seed_{s}_fold_{fold_nb+1}.zip', 'w') as zf:
            zf.write(
                f'../input/moatabnetcorrect2/TabNet_seed_{s}_fold_{fold_nb+1}/model_params.json',
                arcname='model_params.json')
            zf.write(
                f'../input/moatabnetcorrect2/TabNet_seed_{s}_fold_{fold_nb+1}/network.pt',
                arcname='network.pt')
        model = TabNetRegressor()
        ### Predict on test ###
        model.load_model(f"TabNet_seed_{s}_fold_{fold_nb+1}.zip")
        preds_test = model.predict(X_test)
        # Manual sigmoid: raw model outputs are logits.
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

test_preds_all = np.stack(test_cv_preds)

# In[115]:

all_feat = [col for col in df.columns if col not in ["sig_id"]]
# To obtain the same length of test_preds_all and submission
test = pd.read_csv("../input/lish-moa/test_features.csv")
# Control-vehicle rows are excluded — presumably they were also skipped at
# predict time; verify against how X_test was built.
sig_id = test[test["cp_type"] != "ctl_vehicle"].sig_id.reset_index(drop=True)
tmp = pd.DataFrame(test_preds_all.mean(axis=0), columns=all_feat)
tmp["sig_id"] = sig_id
submission = pd.merge(test[["sig_id"]], tmp, on="sig_id", how="left")
def run(try_num, config):
    """Train TabNet regressors over seeds x folds on the MoA dataset and
    write OOF predictions plus a submission CSV into a per-try model dir.

    Parameters: ``try_num`` names the output directory; ``config`` carries
    paths, fold/seed lists and training hyperparameters.
    """
    args = get_args()
    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)
    # Avoid OpenMP "duplicate library" aborts when several runtimes load.
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    model_dir = f'blending-02-tabnet-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    if args.debug:
        # Shrink the data and schedule for a quick smoke run.
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        # NOTE(review): -3982 presumably keeps the DAE rows belonging to the
        # test set — confirm against how dae_features was generated.
        dae_features = pd.concat([dae_features.iloc[:500],
                                  dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(dict(
            n_folds=3,
            seeds=[222],
            n_epochs=3,
            batch_size=128,
        ))
    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)
    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features,
        dae_features)
    features_columns = [col for col in train_features.columns
                        if col not in ['sig_id', 'cp_type', 'cp_time', 'cp_dose',
                                       'cp_type_ctl_vehicle', 'cp_type_trt_cp']]
    train_features = train_features[features_columns]
    test_features = test_features[features_columns]
    smooth_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42,
                                      shuffle=True)
    # Shape (n_samples, n_seeds, n_targets): one OOF slab per seed, averaged
    # across seeds at the end.
    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = []
    for seed_index, seed in enumerate(config.seeds):
        print(f'Train seed {seed}', flush=True)
        set_seed(seed)
        for fold_index, (train_indices, val_indices) in enumerate(kfold.split(
            train_targets[target_columns].values,
            train_targets[target_columns].values
        )):
            print(f'Train fold {fold_index + 1}', flush=True)
            x_train = train_features.loc[train_indices, features_columns].values
            y_train = train_targets.loc[train_indices, target_columns].values
            x_val = train_features.loc[val_indices, features_columns].values
            y_val = train_targets.loc[val_indices, target_columns].values
            weights_path = f'{model_dir}/weights-{seed}-{fold_index}.pt'
            tabnet_conf = dict(
                seed=seed,
                optimizer_fn=optim.Adam,
                scheduler_fn=optim.lr_scheduler.ReduceLROnPlateau,
                n_d=32, n_a=32, n_steps=1, gamma=1.3, lambda_sparse=0,
                momentum=0.02,
                optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                scheduler_params=dict(mode="min", patience=5, min_lr=1e-5,
                                      factor=0.9),
                mask_type="entmax",
                verbose=10,
                n_independent=1, n_shared=1,
            )
            if args.only_pred:
                print('Skip training', flush=True)
            else:
                model = TabNetRegressor(**tabnet_conf)
                model.fit(
                    X_train=x_train, y_train=y_train,
                    eval_set=[(x_val, y_val)], eval_name=['val'],
                    # 'logits_ll' is a custom eval metric registered elsewhere
                    # in this project.
                    eval_metric=['logits_ll'],
                    max_epochs=config.n_epochs, patience=20, batch_size=1024,
                    virtual_batch_size=32, num_workers=1, drop_last=True,
                    loss_fn=smooth_loss_function
                )
                model.save_model(weights_path)
                print('Save weights to: ', weights_path, flush=True)
            # Always reload from disk so the predict path is identical whether
            # or not training happened in this process (save_model wrote
            # ``weights_path + '.zip'``).
            model = TabNetRegressor(**tabnet_conf)
            model.load_model(f'{weights_path}.zip')
            val_preds = sigmoid(model.predict(x_val))
            score = mean_log_loss(y_val, val_preds, n_targets)
            print(f'fold_index {fold_index} - val_loss: {score:5.5f}', flush=True)
            oof_preds[val_indices, seed_index, :] = val_preds
            preds = sigmoid(model.predict(test_features.values))
            test_preds.append(preds)
        score = mean_log_loss(train_targets[target_columns].values,
                              oof_preds[:, seed_index, :], n_targets)
        print(f'Seed {seed} - val_loss: {score:5.5f}', flush=True)
    # Average per-seed OOF slabs, then score the blend.
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets[target_columns].values, oof_preds,
                          n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)
    oof_pred_df = train_targets.copy()
    oof_pred_df.loc[:, target_columns] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)
    # Re-read the raw test file: the preprocessed frame above dropped 'cp_type'.
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = np.mean(test_preds, axis=0)
    # Zero out control-vehicle rows (presumably scored as all-zero targets;
    # confirm against competition rules).
    submission.loc[test_features['cp_type'] == 'ctl_vehicle', target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
def pred_tabnet(x_train, y_train, x_test, submission, feature_cols, target_cols,
                seeds, nfolds, load_path, stacking=False):
    """Load per-(seed, fold) TabNet checkpoints, predict on validation/train/
    test, and combine test predictions by plain averaging or by a linear
    stacking blend fitted on whole-train OOF predictions.

    Returns the final test prediction matrix, clipped to [0, 1].
    """
    cfg_tabnet = Config_TabNet()  # NOTE(review): created but never used here.
    test_cv_preds = []
    oof_preds = []
    scores = []
    for seed in seeds:
        print('*' * 60)
        # Fold assignment column written by an upstream CV-split step.
        kfold_col = f'kfold_{seed}'
        print("seed: {}".format(seed))
        print('*' * 60)
        for fold in range(nfolds):
            # NOTE(review): these two zeroed frames are never read afterwards —
            # dead variables.
            oof_preds_fold = y_train.copy()
            oof_preds_fold.iloc[:, :] = 0
            test_cv_preds_fold = submission.copy()
            test_cv_preds_fold.iloc[:, :] = 0
            print("FOLD: {}".format(fold + 1))
            print('*' * 60)
            # NOTE(review): train_df is computed but unused — only the
            # validation split is consumed below.
            train_df = x_train[x_train[kfold_col] != fold].reset_index(
                drop=True)
            valid_df = x_train[x_train[kfold_col] == fold].reset_index(
                drop=True)
            x_val, y_val = valid_df[feature_cols].values, valid_df[
                target_cols].values
            # Loop-invariant; also leaks out of the loop — the stacking branch
            # below relies on y_tot still being bound after the loops.
            x_tot, y_tot = x_train[feature_cols].values, y_train[
                target_cols].values
            # tabnet model
            model = TabNetRegressor()
            # Locate the saved checkpoint; if only the unpacked directory
            # exists, zip it to a temp archive, load, then clean up.
            path = os.path.join(load_path, f"TabNet_seed{seed}_FOLD{fold}")
            if os.path.exists(path + ".zip"):
                model.load_model(path + ".zip")
            else:
                tmppath = os.path.join("./", f"TabNet_seed{seed}_FOLD{fold}")
                shutil.make_archive(tmppath, "zip", path)
                model.load_model(tmppath + ".zip")
                os.remove(tmppath + ".zip")
            # Predict on validation
            preds_val = model.predict(x_val)
            # Apply sigmoid to the predictions (outputs are logits)
            preds = 1 / (1 + np.exp(-preds_val))
            score = Logloss(y_val, preds)
            scores.append(score)
            print(f"TabNet, seed{seed}, FOLD{fold}, CV predict loss: {score}")
            print('*' * 60)
            # predict on the whole train set for stacking
            preds_tot = model.predict(x_tot)
            preds_tot = 1 / (1 + np.exp(-preds_tot))
            oof_preds.append(preds_tot)
            # Predict on test
            preds_test = model.predict(x_test[feature_cols].values)
            preds_test = 1 / (1 + np.exp(-preds_test))
            test_cv_preds.append(preds_test)
    oof_preds_all = np.stack(oof_preds)
    test_preds_all = np.stack(test_cv_preds)
    print("Averaged Best Score for CVs is: {}".format(np.mean(scores)))
    if not stacking:
        # Simple average over all (seed, fold) models.
        test_pred_final = test_preds_all.mean(axis=0)
    else:
        print("stacking...")
        num_models = len(seeds) * nfolds
        test_pred_final = np.zeros(test_preds_all.shape[1:])
        weights = np.zeros(num_models)
        # stacking method: flatten each model's train predictions into one
        # column and regress the flattened targets on them.
        oof_preds_all = np.array(oof_preds_all)
        oof_preds_all = np.reshape(oof_preds_all, (num_models, -1))
        y_target = np.array(y_tot)
        y_target = np.reshape(y_target,
                              (y_target.shape[0] * y_target.shape[1], -1))
        oof_preds_all = oof_preds_all.T
        print(f"oof shape is {oof_preds_all.shape}")
        print(f"targets is {y_target.shape}")
        # calculate blend weights
        reg = LinearRegression().fit(oof_preds_all, y_target)
        weights = reg.coef_[0]
        intercept = reg.intercept_
        test_pred_final[:, :] = intercept
        print(f"intercept is {intercept}")
        print(f"weights are {weights}")
        for idx in range(num_models):
            test_pred_final += test_preds_all[idx] * weights[idx]
    # Clip to valid probability range (no-op for the averaging branch, where
    # sigmoid outputs are already in (0, 1)).
    test_pred_final = np.clip(test_pred_final, 0, 1)
    return test_pred_final
def run_training_tabnet(train, test, trn_idx, val_idx, feature_cols,
                        target_cols, fold, seed, filename="tabnet"):
    """Train (optionally) and predict one CV fold with TabNet.

    Returns ``(oof, predictions)`` where ``oof`` holds sigmoid-activated
    validation predictions at ``val_idx`` and ``predictions`` covers the
    full test set.
    """
    seed_everything(seed)
    train_ = process_data(train)
    test_ = process_data(test)
    train_df = train_.loc[trn_idx, :].reset_index(drop=True)
    valid_df = train_.loc[val_idx, :].reset_index(drop=True)
    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values
    model = TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=1,
        lambda_sparse=0,
        # First two feature columns are categorical with 3 and 2 levels.
        cat_dims=[3, 2],
        cat_emb_dim=[1, 1],
        cat_idxs=[0, 1],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',
        # device_name=DEVICE,
        scheduler_params=dict(milestones=[100, 150], gamma=0.9),  #)
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        verbose=10,
        seed=seed)
    loss_fn = LabelSmoothing(0.001)
    # eval_metric = SmoothedLogLossMetric(0.001)
    # eval_metric_nosmoothing = SmoothedLogLossMetric(0.)
    # NOTE(review): ``target`` here is a module-level global, not a parameter
    # of this function — verify it matches ``target_cols``.
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    if IS_TRAIN:
        # print("isnan", np.any(np.isnan(x_train)))
        model.fit(X_train=x_train,
                  y_train=y_train,
                  eval_set=[(x_valid, y_valid)],
                  eval_metric=[LogLossMetric, SmoothedLogLossMetric],
                  max_epochs=200,
                  patience=50,
                  batch_size=1024,
                  virtual_batch_size=128,
                  num_workers=0,
                  drop_last=False,
                  loss_fn=loss_fn)
        model.save_model(f"{MODEL_DIR}/{NB}_(unknown)_SEED{seed}_FOLD{fold}")
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    # Fresh instance with identical hyperparameters for the inference pass.
    model = TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=1,
        lambda_sparse=0,
        cat_dims=[3, 2],
        cat_emb_dim=[1, 1],
        cat_idxs=[0, 1],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',
        # device_name=DEVICE,
        scheduler_params=dict(milestones=[100, 150], gamma=0.9),  #)
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        verbose=10,
        seed=seed)
    # NOTE(review): the save above lets TabNet append its own suffix while the
    # load expects ".model" — confirm these resolve to the same file on disk.
    model.load_model(
        f"{MODEL_DIR}/{NB}_(unknown)_SEED{seed}_FOLD{fold}.model")
    valid_preds = model.predict(x_valid)
    # Convert logits to probabilities.
    valid_preds = torch.sigmoid(
        torch.as_tensor(valid_preds)).detach().cpu().numpy()
    oof[val_idx] = valid_preds
    predictions = model.predict(x_test)
    predictions = torch.sigmoid(
        torch.as_tensor(predictions)).detach().cpu().numpy()
    return oof, predictions
# NOTE(review): fragment — the first three lines are the trailing kwargs of a
# model.fit(...) call whose opening lies outside this chunk, and the live
# statements reference fold_nb/val_idx from an enclosing loop also outside it,
# so indentation here is a best guess.
          num_workers=1,
          drop_last=False,
          loss_fn=SmoothBCEwLogits(smoothing=5e-5))
print('-' * 60)

### Predict on validation ###
preds_val = model.predict(X_val)
# Apply sigmoid to the predictions (model outputs are logits)
preds = 1 / (1 + np.exp(-preds_val))
score = np.min(model.history["val_logits_ll"])
saving_path_name = 'TabNet_seed_' + str(
    tabnet_params['seed']) + '_fold_' + str(fold_nb + 1)
# save_model returns the full path it wrote (with its own suffix), which is
# reused for the reloads below.
saved_filepath = model.save_model(saving_path_name)
loaded_model = TabNetRegressor()
loaded_model.load_model(saved_filepath)

### Save OOF for CV ###
# Raw logits (not ``preds``) are accumulated, averaged across seeds.
oof_preds[val_idx] += preds_val / len(SEED)
scores.append(score)

### Predict on test ###
model.load_model(saved_filepath)
preds_test = model.predict(X_test)
test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

test_preds_all = np.stack(test_cv_preds)

train_features = pd.read_csv('../../inputs/train_features.csv')
train_targets_scored = pd.read_csv('../../inputs/train_targets_scored.csv')
oof = train_features.merge(train_targets_scored, on='sig_id')
# eval_name = ["val"], # eval_metric = ["logits_ll"], # max_epochs = MAX_EPOCH, # patience = 20, # batch_size = 1024, # virtual_batch_size = 32, # num_workers = 1, # drop_last = False, # # To use binary cross entropy because this is not a regression problem # loss_fn = SmoothCrossEntropyLoss(smoothing=smoothing) # ) # !cp -r ../input/tabnet_models/{str(seed)}_{str(fold)}/* . # !zip {seed}_{fold}.zip model_params.json network.pt model.load_model(f'./TabNet_FOLD{fold+1}_SEED{seed}.zip') print('-' * 60) ### Predict on validation ### preds_val = model.predict(x_valid) # Apply sigmoid to the predictions preds = 1 / (1 + np.exp(-preds_val)) preds = np.clip(preds, p_min, p_max) oof_tmp[val_idx] += preds # score = np.min(model.history["val_logits_ll"]) ### Save OOF for CV ### oof_preds.append(preds) oof_targets.append(y_valid) # scores.append(score)
# NOTE(review): fragment — "verbose=10)" closes a call begun outside this
# chunk, and the final load_state_dict(torch.load( is truncated mid-expression
# at the chunk boundary; indentation is a best guess.
    verbose=10)

test_preds_tabnet = []
test_preds_nn = []
test_preds_nn_transfer_learn = []

for seed in seeds:
    # Re-create the same CV split used at train time so fold indices line up
    # with the saved checkpoints.
    mskf = MultilabelStratifiedKFold(n_splits=number_of_splits,
                                     random_state=seed, shuffle=True)
    for fold_nb, (train_idx, val_idx) in enumerate(
            mskf.split(train_df, targets)):
        print("FOLDS: ", fold_nb)
        model = TabNetRegressor(**tabnet_params)
        model.load_model("trained_models/tabnet_" + f"fold_{fold_nb}_{seed}.zip")
        preds_test = model.predict(X_test)
        # Manual sigmoid: raw outputs are logits.
        preds_test = 1 / (1 + np.exp(-preds_test))
        test_preds_tabnet.append(preds_test)
        test_ds = TabularDatasetTest(X_test)
        test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
        model = simple_neural_net(num_features=in_size,
                                  num_targets=out_size,
                                  hidden_size=hidden_size)
        if device == "cpu":
            model.load_state_dict(
                torch.load(