def build_tabnet():
    """Train (or load from disk) a TabNet classifier on the current dataset.

    Reads the module-level ``current_dataset`` / ``current_dataset_name`` and
    ``LOAD_MODEL`` flag, vectorizes the cleaned text with a TF-IDF token
    matrix, and evaluates on a stratified hold-out split.

    Returns:
        tuple: (model, y_test, y_pred, test_acc) where ``test_acc`` is the
        accuracy on the held-out test split.
    """
    model_file_name = 'tabnet_model_{}'.format(current_dataset_name)
    df = current_dataset.copy()
    cleaning_text(df)
    X = df['clean_content']
    y = df['emotion']

    # Tokenizer producing a TF-IDF matrix over a 1000-word vocabulary.
    tok = Tokenizer(num_words=1000, oov_token='<UNK>')

    # Split BEFORE vectorizing so the tokenizer never sees test data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, stratify=y, random_state=1)

    # BUG FIX: the original fit the tokenizer on the test split first, built
    # the test matrix, then RE-fit the tokenizer on the train split — so the
    # two matrices used different vocabularies/document frequencies and test
    # data leaked into the vectorizer. Fit on the training split only, then
    # transform both splits with the same fitted vocabulary.
    tok.fit_on_texts(X_train)
    X_train = tok.texts_to_matrix(X_train, mode='tfidf')
    X_test = tok.texts_to_matrix(X_test, mode='tfidf')

    if LOAD_MODEL and pathlib.Path(model_file_name).exists():
        # NOTE(review): pickle.load is unsafe on untrusted files; this is
        # acceptable only because the file is produced locally below.
        with open(model_file_name, 'rb') as fh:
            model = pickle.load(fh)
    else:
        model = TabNetClassifier()
        model.fit(X_train=X_train, y_train=y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_name=['train', 'valid'],
                  eval_metric=['accuracy', 'balanced_accuracy', 'logloss'])

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)

    # Persist the (possibly re-loaded) model; context manager closes the
    # handle even if pickling fails.
    with open(model_file_name, 'wb') as fh:
        pickle.dump(model, fh)

    return model, y_test, y_pred, test_acc
def fit(self, x_train, y_train, kf_splits=5, tabnet_type=None):
    """K-fold cross-validated TabNet training with self-supervised pretraining.

    For each fold, a ``TabNetPretrainer`` is fit on the fold's training rows
    and the supervised model is warm-started from it. Trained fold models are
    collected on ``self.tabnet_models``.

    Args:
        x_train: 2-D array of features (indexable by row, has ``.shape``).
        y_train: target array aligned with ``x_train``.
        kf_splits: number of cross-validation folds (default 5).
        tabnet_type: ``None`` for the default classification preset or
            ``'TabNet-S'`` for the small regression preset; anything else
            aborts the process.

    Returns:
        tuple: (scores, feature_importances) — per-fold metric values
        (accuracy for classification, MSE for regression) and the stacked
        per-fold feature-importance rows.
    """

    def _get_tabnet_params(tabnet_type):
        """Return (tabnet_params, fit_params, problem) for the given preset."""
        if (tabnet_type is None):
            tabnet_params = dict(
                verbose=40,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=1e-2, weight_decay=1e-5),
                scheduler_params=dict(max_lr=0.05,
                                      steps_per_epoch=x_train.shape[0] // 128,
                                      epochs=300),
                scheduler_fn=torch.optim.lr_scheduler.OneCycleLR)
            fit_params = dict(batch_size=1024, virtual_batch_size=128,
                              eval_metric='accuracy')
        elif (tabnet_type == 'TabNet-S'):
            tabnet_params = dict(
                n_d=8, n_a=8, lambda_sparse=0.0001, momentum=0.1,
                n_steps=3, gamma=1.2, verbose=40,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=0.01),
                scheduler_params=dict(step_size=8000, gamma=0.05),
                scheduler_fn=torch.optim.lr_scheduler.StepLR)
            fit_params = dict(batch_size=4096, virtual_batch_size=256,
                              eval_metric='mse')
        else:
            print('[ERROR] Unknown tabnet_type: {}'.format(tabnet_type))
            quit()

        # --- check problem ---
        # The problem type is inferred from the preset's eval metric; both
        # presets above map to exactly one of these branches.
        if fit_params['eval_metric'] in [
                'auc', 'accuracy', 'balanced_accuracy', 'logloss'
        ]:
            problem = 'classification'
        elif fit_params['eval_metric'] in ['mse', 'mae', 'rmse', 'rmsle']:
            problem = 'regression'
        return tabnet_params, fit_params, problem

    kf = KFold(n_splits=kf_splits, shuffle=False)
    scores = []
    self.tabnet_models = []
    tabnet_params, fit_params, problem = _get_tabnet_params(tabnet_type)

    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        if (problem == 'classification'):
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = TabNetClassifier(**tabnet_params)
        elif (problem == 'regression'):
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = TabNetRegressor(**tabnet_params)
        else:
            # BUG FIX: original called undefined `pring`, which would raise
            # NameError instead of printing the intended error message.
            print('[ERROR] Unknown problem: {}'.format(problem))
            quit()

        x_tr = x_train[train_index]
        x_val = x_train[val_index]
        y_tr = y_train[train_index]
        y_val = y_train[val_index]

        # Self-supervised pretraining on this fold's training rows only.
        unsupervised_model.fit(x_tr,
                               eval_set=[x_val],
                               patience=300,
                               max_epochs=5000,
                               pretraining_ratio=0.8)

        # Supervised fine-tuning, warm-started from the pretrainer.
        tabnet_model.fit(
            x_tr, y_tr,
            eval_set=[(x_val, y_val)],
            eval_metric=[fit_params['eval_metric']],
            batch_size=fit_params['batch_size'],
            virtual_batch_size=fit_params['virtual_batch_size'],
            patience=300,
            max_epochs=5000,
            from_unsupervised=unsupervised_model)

        self.tabnet_models.append(tabnet_model)

        prediction = tabnet_model.predict(x_val)
        if (problem == 'classification'):
            scores.append(accuracy_score(y_val, prediction))
        elif (problem == 'regression'):
            scores.append(mean_squared_error(y_val, prediction))
        else:
            # BUG FIX: same `pring` typo as above; corrected to `print`.
            print('[ERROR] Unknown problem: {}'.format(problem))
            quit()

        # Stack one row of feature importances per fold.
        if (i == 0):
            feature_importances = tabnet_model.feature_importances_.copy()
        else:
            feature_importances = np.vstack(
                (feature_importances, tabnet_model.feature_importances_))

    print(scores)
    print(np.mean(scores))

    return scores, feature_importances