def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    deep_layers,
    batch_norm_continuous_input,
    attention_pooling,
):
    """Smoke-test an AutoInt regression model: fit, evaluate, predict.

    Does nothing when both feature lists are empty. Otherwise a
    ``TabularModel`` built from ``AutoIntConfig`` is fitted, and the test
    checks that evaluation reports ``test_mean_squared_error`` and that
    prediction yields one row per row of the test frame.
    """
    train, test, target = regression_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    targets = target + ["MedInc"] if multi_target else target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_config_params = {
        "task": "regression",
        "deep_layers": deep_layers,
        "batch_norm_continuous_input": batch_norm_continuous_input,
        "attention_pooling": attention_pooling,
    }
    if target_range:
        # One (min, max) pair per target, taken from the training frame.
        model_config_params["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = AutoIntConfig(**model_config_params)

    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "test_mean_squared_error" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
示例#2
0
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """Smoke-test a TabTransformer regression model: fit, evaluate, predict.

    Does nothing when both feature lists are empty. Otherwise the test
    checks that evaluation reports ``test_mean_squared_error`` and that
    prediction yields one row per row of the test frame.
    """
    train, test, target = regression_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    targets = target + ["MedInc"] if multi_target else target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_config_params = {
        "task": "regression",
        "input_embed_dim": 8,
        "num_attn_blocks": 1,
        "num_heads": 2,
    }
    if target_range:
        # One (min, max) pair per target, taken from the training frame.
        model_config_params["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = TabTransformerConfig(**model_config_params)

    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "test_mean_squared_error" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
示例#3
0
def test_regression(
    regression_data,
    multi_target,
    embed_categorical,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """Smoke-test a NODE regression model: fit, evaluate, predict.

    Does nothing when both feature lists are empty. Otherwise the test
    checks that evaluation reports ``valid_loss`` and that prediction
    yields one row per row of the test frame.
    """
    train, test, target = regression_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    targets = target + ["MedInc"] if multi_target else target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_config_params = {
        "task": "regression",
        "depth": 2,
        "num_trees": 50,
        "embed_categorical": embed_categorical,
    }
    if target_range:
        # One (min, max) pair per target, converted with ``.item()``.
        model_config_params["target_range"] = [
            (train[col].min().item(), train[col].max().item())
            for col in data_config.target
        ]
    model_config = NodeConfig(**model_config_params)

    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=0,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "valid_loss" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
示例#4
0
def test2(nrows=10000):
    """Fit, evaluate and predict with a CategoryEmbedding classifier.

    Usage: python source/models/torch_tabular.py test

    The frame and its column lists come from ``test_dataset_covtype``; the
    frame is split into train / validation / test parts, the model is
    fitted on train with validation, and the evaluation result and the
    predictions for the first 100 validation rows are logged.

    NOTE(review): ``nrows`` is not read anywhere in the body; the dataset
    helper is called with a fixed 1000 -- confirm whether that is intended.
    """
    global model, session

    df, colcat, colnum, coly = test_dataset_covtype(1000)
    target_name = coly

    df.head()
    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)

    # Number of distinct target values seen in the training part.
    train_labels = train[target_name].values.ravel()
    num_classes = len(set(train_labels))

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # "quantile_normal",
        normalize_continuous_features=False,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        metrics_params=[{"num_classes": num_classes}, {}],
    )
    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    # Built but not handed to the model (see the commented-out argument below).
    experiment_config = ExperimentConfig(
        project_name="PyTorch Tabular Example",
        run_name="node_forest_cov",
        exp_watch="gradients",
        log_target="wandb",
        log_logits=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        # experiment_config=experiment_config,
    )
    tabular_model.fit(train=train, validation=val)

    eval_result = tabular_model.evaluate(val)
    log(eval_result)

    test.drop(columns=target_name, inplace=True)
    head_of_val = val.iloc[:100, :]
    predictions = tabular_model.predict(head_of_val)
    log(predictions)
示例#5
0
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    variant,
    num_gaussian,
):
    """Smoke-test a mixture-density regression model: fit, evaluate, predict.

    ``variant`` is called as the model-config constructor and receives a
    ``MixtureDensityHeadConfig`` built with ``num_gaussian``. Does nothing
    when both feature lists are empty. Otherwise the test checks that
    evaluation reports ``test_mean_squared_error`` and that prediction
    yields one row per row of the test frame.
    """
    train, test, target = regression_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    targets = target + ["MedInc"] if multi_target else target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_config = variant(
        task="regression",
        mdn_config=MixtureDensityHeadConfig(num_gaussian=num_gaussian),
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "test_mean_squared_error" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    deep_layers,
    batch_norm_continuous_input,
):
    """Smoke-test an AutoInt classification model: fit, evaluate, predict.

    Does nothing when both feature lists are empty. Otherwise the test
    checks that evaluation reports ``test_accuracy`` and that prediction
    yields one row per row of the test frame.
    """
    train, test, target = classification_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = AutoIntConfig(
        task="classification",
        deep_layers=deep_layers,
        batch_norm_continuous_input=batch_norm_continuous_input,
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "test_accuracy" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
示例#7
0
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Smoke-test a TabTransformer classification model: fit, evaluate, predict.

    Does nothing when both feature lists are empty. Otherwise the test
    checks that evaluation reports ``test_accuracy`` and that prediction
    yields one row per row of the test frame.
    """
    train, test, target = classification_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = TabTransformerConfig(
        task="classification",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
    )
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "test_accuracy" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
示例#8
0
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    embed_categorical,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Smoke-test a NODE classification model: fit, evaluate, predict.

    Does nothing when both feature lists are empty. Otherwise the test
    checks that evaluation reports ``valid_loss`` and that prediction
    yields one row per row of the test frame.
    """
    train, test, target = classification_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = NodeConfig(
        task="classification",
        depth=2,
        num_trees=50,
        embed_categorical=embed_categorical,
    )
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=0,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    assert "valid_loss" in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
def main():
    """Train a CategoryEmbedding classifier on synthetic data and save it.

    Generates a mixed-type classification frame, splits it into
    train / validation / test parts, fits the model, evaluates and predicts
    on the test part, prints holdout metrics and saves the model under
    "Analysis/basic".
    """
    # Synthetic data plus the names of its categorical / numeric columns.
    data, cat_col_names, num_col_names = make_mixed_classification(
        n_samples=10000,
        n_features=20,
        n_categories=4,
    )
    train, test = train_test_split(data, random_state=42)
    train, val = train_test_split(train, random_state=42)

    # Configuration objects.
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=1024,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        layers="1024-512-512",
        activation="LeakyReLU",
        learning_rate=1e-3,
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )

    # Train, then compute loss and metrics on the held-out part.
    tabular_model.fit(train=train, validation=val)
    result = tabular_model.evaluate(test)

    # Predictions come back as a DataFrame.
    pred_df = tabular_model.predict(test)
    pred_df.head()

    print_metrics(test["target"], pred_df["prediction"], tag="Holdout")

    # Persist the fitted model.
    tabular_model.save_model("Analysis/basic")
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    target_transform,
    custom_metrics,
    custom_loss,
    custom_optimizer,
):
    """Smoke-test CategoryEmbedding regression with custom fit arguments.

    Does nothing when both feature lists are empty. Otherwise the model is
    fitted with the supplied metrics, target transform, loss and optimizer.
    Evaluation must report ``test_mean_squared_error`` when
    ``custom_metrics`` is ``None`` and ``test_fake_metric`` otherwise, and
    prediction must yield one row per row of the test frame.
    """
    train, test, target = regression_data
    feature_count = len(continuous_cols) + len(categorical_cols)
    if feature_count == 0:
        # No features at all: there is nothing to fit.
        return

    targets = target + ["MedInc"] if multi_target else target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_config_params = {"task": "regression"}
    if target_range:
        # One (min, max) pair per target, taken from the training frame.
        model_config_params["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = CategoryEmbeddingModelConfig(**model_config_params)

    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        target_transform=target_transform,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )

    reported_metrics = tabular_model.evaluate(test)[0].keys()
    if custom_metrics is None:
        expected_metric = "test_mean_squared_error"
    else:
        expected_metric = "test_fake_metric"
    assert expected_metric in reported_metrics

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
示例#11
0
def main():
    """Cross-validate a NODE classifier and write a submission file.

    Loads the labelled frame and the unlabelled test frame through
    ``data_load()``, runs 10-fold stratified cross-validation, and for each
    fold fits a fresh ``TabularModel`` with a class-weighted cross-entropy
    loss, predicts the fold's held-out rows and predicts the whole external
    test frame in chunks. The per-fold "0_probability" columns are averaged
    and thresholded into the "target" column of
    "Analysis/submission_2_node.csv". Out-of-fold metrics and the confusion
    matrix are printed.
    """
    data, test_data, cat_col_names, num_col_names = data_load()

    bsize = 2500 * 2

    # ##########Define the Configs############
    data_config = DataConfig(target=["target"],
                             continuous_cols=num_col_names,
                             categorical_cols=cat_col_names,
                             num_workers=4)
    trainer_config = TrainerConfig(auto_lr_find=True,
                                   batch_size=bsize,
                                   max_epochs=100,
                                   gpus=1)
    optimizer_config = OptimizerConfig()

    model_config = NodeConfig(
        task="classification",
        num_layers=2,  # Number of Dense Layers
        num_trees=1024,  # Number of Trees in each layer
        depth=3,  # Depth of each Tree
        embed_categorical=True,
        # If True, will use a learned embedding, else it will use LeaveOneOutEncoding for categorical columns
        learning_rate=1e-3,
        additional_tree_output_dim=5)

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    # Rows per predict() call on the external test frame.
    chunk_size = 20000

    res_pred = []
    res_test = []
    for fold, (train_idx,
               test_idx) in enumerate(cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        tabular_mode = TabularModel(data_config=data_config,
                                    optimizer_config=optimizer_config,
                                    model_config=model_config,
                                    trainer_config=trainer_config)
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1)

        # Training the Model
        tabular_mode.fit(train=train,
                         validation=val,
                         max_epochs=100,
                         loss=weighted_loss)
        fold_pred = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(fold_pred)

        # NOTE(review): the score is computed from the "prediction" column,
        # not from a probability column -- confirm that this is intended.
        fold_auc = roc_auc_score(test.target.values,
                                 fold_pred.prediction.values)
        print(f"Fold {fold} AUC score: {fold_auc}")

        # Predict the external test frame chunk by chunk. Stepping by
        # ``chunk_size`` also covers the final partial chunk, which a
        # floor-divided chunk count would silently drop (leaving the
        # submission column shorter than the sample file).
        chunk_preds = [
            tabular_mode.predict(test_data.iloc[start:start + chunk_size])
            for start in range(0, test_data.shape[0], chunk_size)
        ]
        res_test.append(pd.concat(chunk_preds))

    # Average the class-0 probability over folds; a mean above 0.5 maps to
    # label 0, anything else to label 1.
    mean_prob = pd.concat(
        [res_testi.loc[:, ["0_probability"]] for res_testi in res_test],
        axis=1).apply(np.mean, axis=1)
    submission_labels = mean_prob.map(lambda x: 0 if x > 0.5 else 1)

    sample_submission = pd.read_csv("Data/sample_submission.csv")
    sample_submission["target"] = submission_labels.values

    # Out-of-fold predictions, restored to the original row order.
    pred_tot = pd.concat(res_pred).sort_index()

    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    sample_submission.to_csv("Analysis/submission_2_node.csv", index=False)

    print(confusion_matrix(data['target'], pred_tot["prediction"]))
示例#12
0
        'target'
    ],  #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# Script fragment: configure, train, evaluate, save and reload a
# CategoryEmbedding classifier. ``data_config``, ``train``, ``val`` and
# ``test`` are not defined in this fragment -- presumably created earlier in
# the script; verify against the full file.
trainer_config = TrainerConfig(
    auto_lr_find=
    True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=1,  # index of the GPU to use. 0, means CPU
)
# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between each layers
    learning_rate=1e-3)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
# Fit on the training part, using ``val`` as the validation set.
tabular_model.fit(train=train, validation=val)
# Evaluate and predict on the held-out part.
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)
# Save the fitted model, then load it back from the same directory.
tabular_model.save_model("Analysis/basic")
loaded_model = TabularModel.load_from_checkpoint("Analysis/basic")
示例#13
0
def test2(nrows=10000):
    """Fit, evaluate and predict with a CategoryEmbedding classifier on covtype.

    Usage: python source/models/torch_tabular.py test

    Downloads ``covtype.data.gz`` into ``~/data/input/covtype/`` when it is
    not already there, reads the first ``nrows`` rows, splits them into
    train / validation / test parts, fits the model and logs the evaluation
    result and the predictions for the first 100 validation rows.
    """
    global model, session

    # X = np.random.rand(10000,20)
    # y = np.random.binomial(n=1, p=0.5, size=[10000])
    BASE_DIR = Path.home().joinpath('data/input/covtype/')
    datafile = BASE_DIR.joinpath('covtype.data.gz')
    datafile.parent.mkdir(parents=True, exist_ok=True)
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
    if not datafile.exists():
        wget.download(url, datafile.as_posix())

    target_name = ["Covertype"]
    # Wilderness_Area1..4 followed by Soil_Type1..40.
    wilderness_cols = [f"Wilderness_Area{idx}" for idx in range(1, 5)]
    soil_cols = [f"Soil_Type{idx}" for idx in range(1, 41)]
    colcat = wilderness_cols + soil_cols
    colnum = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]

    # The file has no header row; names are given as numeric, categorical, target.
    feature_columns = colnum + colcat + target_name
    df = pd.read_csv(datafile, header=None, names=feature_columns, nrows=nrows)

    df.head()
    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)

    # Number of distinct target values seen in the training part.
    train_labels = train[target_name].values.ravel()
    num_classes = len(set(train_labels))

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # "quantile_normal",
        normalize_continuous_features=False,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        metrics_params=[{"num_classes": num_classes}, {}],
    )
    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    # Built but not handed to the model (see the commented-out argument below).
    experiment_config = ExperimentConfig(
        project_name="PyTorch Tabular Example",
        run_name="node_forest_cov",
        exp_watch="gradients",
        log_target="wandb",
        log_logits=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        # experiment_config=experiment_config,
    )
    tabular_model.fit(train=train, validation=val)

    eval_result = tabular_model.evaluate(val)
    log(eval_result)

    test.drop(columns=target_name, inplace=True)
    head_of_val = val.iloc[:100, :]
    predictions = tabular_model.predict(head_of_val)
    log(predictions)
def main_64():
    """Cross-validate a TabNet classifier (n_d = n_a = 64) and write a submission.

    Loads the labelled frame and the unlabelled test frame through
    ``data_load()``, runs 10-fold stratified cross-validation, and for each
    fold fits a fresh ``TabularModel`` with a class-weighted cross-entropy
    loss, saves it under "Analysis/basic_tabnet_rep<fold>", predicts the
    fold's held-out rows and predicts the external test frame. The per-fold
    "0_probability" columns are averaged and thresholded into the "target"
    column of "Analysis/submission_2.csv". Out-of-fold metrics and the
    confusion matrix are printed.
    """
    # ``train`` is declared global, so each fold rebinds the module-level name.
    global train
    data, test_data, cat_col_names, num_col_names = data_load()
    bsize = 2500*3*2*2

    # ##########Define the Configs############
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1
    )
    optimizer_config = OptimizerConfig()

    model_config = TabNetModelConfig(
        task="classification",
        # Learning rate scaled in proportion to the batch size (base 1024).
        learning_rate=1e-3*bsize/1024,
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.3
    )

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    res_pred = []
    res_test = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        tabular_mode = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config
        )
        weighted_loss = get_class_weighted_cross_entropy(train["target"].values.ravel(), mu=0.1)

        # Training the Model
        tabular_mode.fit(train=train, validation=val, max_epochs=100, loss=weighted_loss)
        fold_pred = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(fold_pred)
        tabular_mode.save_model(f"Analysis/basic_tabnet_rep{i}")

        res_test.append(tabular_mode.predict(test_data))

    # Out-of-fold predictions, restored to the original row order.
    pred_tot = pd.concat(res_pred).sort_index()

    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    # Average the "0_probability" column over folds. A mean above 0.5 must
    # map to label 0; mapping it to 1 would invert every submitted label.
    # NOTE(review): assumes "0_probability" is the probability of class 0,
    # as its name says -- confirm against the library's predict() output.
    mean_prob = pd.concat([res_testi.loc[:, ["0_probability"]] for res_testi in res_test], axis=1).apply(np.mean, axis=1)
    submission_labels = mean_prob.map(lambda x: 0 if x > 0.5 else 1)

    sample_submission = pd.read_csv("Data/sample_submission.csv")
    sample_submission["target"] = submission_labels.values

    sample_submission.to_csv("Analysis/submission_2.csv", index=False)

    print(confusion_matrix(data['target'], pred_tot["prediction"]))
def main():
    """Cross-validate a CategoryEmbedding classifier and save the last fold's model.

    Loads the labelled frame through ``data_load()``, runs 10-fold stratified
    cross-validation, and for each fold fits a fresh ``TabularModel`` with a
    class-weighted cross-entropy loss and predicts the fold's held-out rows.
    Out-of-fold metrics and the confusion matrix are printed, and the model
    from the final fold is saved under "Analysis/basic".
    """
    data, cat_col_names, num_col_names = data_load()
    bsize = 1024

    # ##########Define the Configs############
    data_config = DataConfig(target=["target"],
                             continuous_cols=num_col_names,
                             categorical_cols=cat_col_names)
    trainer_config = TrainerConfig(auto_lr_find=True,
                                   batch_size=bsize,
                                   max_epochs=100,
                                   gpus=1)
    optimizer_config = OptimizerConfig()

    model_config = CategoryEmbeddingModelConfig(task="classification",
                                                layers="1024-512-512",
                                                activation="LeakyReLU",
                                                learning_rate=1e-3)

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    res_pred = []
    for train_idx, test_idx in cv.split(X=data, y=data.target.values):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        # A new model per fold; no instance is needed before the loop
        # because every fold rebinds ``tabular_mode``.
        tabular_mode = TabularModel(data_config=data_config,
                                    model_config=model_config,
                                    optimizer_config=optimizer_config,
                                    trainer_config=trainer_config)

        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1)

        # Training the Model
        tabular_mode.fit(train=train,
                         validation=val,
                         max_epochs=100,
                         loss=weighted_loss)
        fold_pred = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(fold_pred)

    # Out-of-fold predictions, restored to the original row order.
    pred_tot = pd.concat(res_pred).sort_index()

    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    # Print the matrix; computing it without using the result shows nothing.
    print(confusion_matrix(data['target'], pred_tot["prediction"]))

    # Saving the model fitted on the last fold.
    tabular_mode.save_model("Analysis/basic")
示例#16
0
def test_pretrained_backbone(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    tmpdir,
):
    """Pre-train a backbone with an SSL task and reuse it for regression.

    ``model_config_class`` is unpacked as a ``(config class, params dict)``
    pair. Stage one fits a model with ``task="ssl"``, checks that
    ``predict`` raises ``AssertionError`` and that evaluation reports
    ``test_mean_squared_error``, then saves and reloads the model from a
    temporary directory. Stage two fits a regression model that receives the
    reloaded model's backbone through ``trained_backbone`` and checks the
    same metric again.
    """
    (train, test, target) = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )

    model_config_class, shared_params = model_config_class
    # Work on a copy: the dict comes in as a test argument, and writing
    # "task" / "ssl_task" / "aug_task" into it in place would leak those
    # keys into every other test that receives the same dict object.
    model_config_params = dict(shared_params)

    # Stage 1: self-supervised pre-training.
    model_config_params["task"] = "ssl"
    model_config_params["ssl_task"] = "Denoising"
    model_config_params["aug_task"] = "cutmix"
    model_config = model_config_class(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )
    result_1 = tabular_model.evaluate(test)
    # Prediction is expected to be rejected for the SSL-task model.
    with pytest.raises(AssertionError):
        tabular_model.predict(test)
    assert "test_mean_squared_error" in result_1[0].keys()

    # Round-trip the pre-trained model through disk.
    sv_dir = tmpdir.mkdir("saved_model")
    tabular_model.save_model(str(sv_dir))
    old_mdl = TabularModel.load_from_checkpoint(str(sv_dir))

    # Stage 2: regression on top of the pre-trained backbone.
    model_config_params["task"] = "regression"
    model_config_params["ssl_task"] = None
    model_config_params["aug_task"] = None
    model_config = model_config_class(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
        trained_backbone=old_mdl.model.backbone,
    )
    result_2 = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result_2[0].keys()