def apply_test_data():
    """Rebuild the TabNet configuration used during CV training and reload
    each of the 10 per-fold checkpoints from ``Analysis/basic_tabnet_rep{i}``.

    Returns
    -------
    list
        The 10 loaded models, one per CV fold, in fold order.  (The original
        version discarded the result of ``load_from_checkpoint``, leaving the
        function with no observable effect.)
    """
    data, test_data, cat_col_names, num_col_names = data_load()
    bsize = 2500 * 3 * 2 * 2  # effective batch size (30_000)

    # Configs must match the ones used at training time for the checkpoints
    # to deserialize correctly — TODO confirm against main_64's settings.
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = TabNetModelConfig(
        task="classification",
        # Linear LR scaling with batch size (base 1e-3 at batch 1024).
        learning_rate=1e-3 * bsize / 1024,
        n_d=24,
        n_a=24,
        n_steps=5,
        gamma=1.3,
    )
    tabular_mode = TabularModel(
        data_config=data_config,
        optimizer_config=optimizer_config,
        model_config=model_config,
        trainer_config=trainer_config,
    )

    models = []
    for i in range(10):
        diri = f"Analysis/basic_tabnet_rep{i}"
        # Keep the loaded model instead of dropping it on the floor.
        models.append(tabular_mode.load_from_checkpoint(dir=diri))
    return models
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """End-to-end smoke test: fit a TabNet regressor for one fast-dev epoch,
    then check that MSE is reported and predictions cover every test row."""
    train, test, target = regression_data

    # No features at all for this parameter combination — nothing to test.
    if not (continuous_cols or categorical_cols):
        return

    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    config_kwargs = {"task": "regression"}
    if target_range:
        # Clamp each output to the min/max observed in the training split.
        config_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = TabNetModelConfig(**config_kwargs)

    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
def test_ssl(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    ssl_task,
    aug_task,
):
    """Verify that configuring TabNet for an SSL task is rejected: building
    or fitting the model must raise an AssertionError."""
    train, test, target = classification_data

    # Skip the degenerate case with no usable feature columns.
    if not (continuous_cols or categorical_cols):
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = TabNetModelConfig(
        task="ssl",
        ssl_task=ssl_task,
        aug_task=aug_task,
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    # TabNet does not support SSL — construction (or at latest the fit call)
    # is expected to trip an assertion.
    with pytest.raises(AssertionError):
        tabular_model = TabularModel(
            data_config=data_config,
            model_config=model_config,
            optimizer_config=optimizer_config,
            trainer_config=trainer_config,
        )
        tabular_model.fit(train=train, test=test)
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """End-to-end smoke test: fit a TabNet classifier for one fast-dev epoch,
    then check that accuracy is reported and predictions cover every row."""
    train, test, target = classification_data

    # Nothing to train on when both feature lists are empty.
    if not (continuous_cols or categorical_cols):
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = TabNetModelConfig(task="classification")
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_accuracy" in result[0].keys()
    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
def main_64():
    """Train a TabNet classifier (n_d = n_a = 64) with 10-fold stratified CV,
    report holdout metrics, and write an averaged-ensemble submission file.

    Side effects
    ------------
    * Rebinds the module-level ``train`` (kept via ``global`` because the
      original did so — other code may read it; TODO confirm).
    * Saves one model per fold under ``Analysis/basic_tabnet_rep{i}``.
    * Writes ``Analysis/submission_2.csv``.
    """
    global train
    data, test_data, cat_col_names, num_col_names = data_load()
    bsize = 2500 * 3 * 2 * 2  # effective batch size (30_000)

    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = TabNetModelConfig(
        task="classification",
        # Linear LR scaling with batch size (base 1e-3 at batch 1024).
        learning_rate=1e-3 * bsize / 1024,
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.3,
    )

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    res_pred = []  # out-of-fold predictions on each held-out split
    res_test = []  # per-fold predictions on the external test set

    for i, (train_idx, test_idx) in enumerate(cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        # A fresh model per fold so folds do not share weights.
        tabular_mode = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config,
        )
        # Class-frequency-weighted loss to counter label imbalance.
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1
        )
        tabular_mode.fit(train=train, validation=val, max_epochs=100, loss=weighted_loss)

        res_pred.append(tabular_mode.predict(test).loc[:, ["prediction"]])
        tabular_mode.save_model(f"Analysis/basic_tabnet_rep{i}")
        res_test.append(tabular_mode.predict(test_data))

    # Out-of-fold predictions stitched back into the original row order.
    pred_tot = pd.concat(res_pred).sort_index()
    print_metrics(data["target"], pred_tot["prediction"], tag="Holdout")

    # Ensemble: average the per-fold "0_probability" column, threshold at 0.5.
    # NOTE(review): thresholding the *class-0* probability to label 1 looks
    # inverted (high P(class 0) -> target 1) — confirm column semantics.
    mean_prob = pd.concat(
        [fold_pred.loc[:, ["0_probability"]] for fold_pred in res_test], axis=1
    ).apply(np.mean, axis=1)
    labels = (mean_prob > 0.5).astype(int)

    sample_submission = pd.read_csv("Data/sample_submission.csv")
    sample_submission["target"] = labels.values
    sample_submission.to_csv("Analysis/submission_2.csv", index=False)
    print(confusion_matrix(data["target"], pred_tot["prediction"]))