Пример #1
0
def apply_test_data():
    """Configure a TabNet TabularModel and load the ten saved replicate checkpoints.

    Returns:
        list: models loaded from ``Analysis/basic_tabnet_rep{0..9}``.
    """
    data, test_data, cat_col_names, num_col_names = data_load()
    # Effective batch size: 2500 * 3 * 2 * 2 = 30000.
    bsize = 2500 * 3 * 2 * 2

    data_config = DataConfig(target=["target"],
                             continuous_cols=num_col_names,
                             categorical_cols=cat_col_names,
                             num_workers=4)
    trainer_config = TrainerConfig(auto_lr_find=True,
                                   batch_size=bsize,
                                   max_epochs=100,
                                   gpus=1)
    optimizer_config = OptimizerConfig()

    # Linear LR scaling: base 1e-3 per 1024 samples, scaled to the batch size.
    model_config = TabNetModelConfig(task="classification",
                                     learning_rate=1e-3 * bsize / 1024,
                                     n_d=24,
                                     n_a=24,
                                     n_steps=5,
                                     gamma=1.3)

    tabular_model = TabularModel(data_config=data_config,
                                 optimizer_config=optimizer_config,
                                 model_config=model_config,
                                 trainer_config=trainer_config)

    # BUG FIX: load_from_checkpoint returns a new model which was previously
    # discarded on every iteration; collect the loaded replicates and return them.
    loaded_models = []
    for i in range(10):
        diri = f"Analysis/basic_tabnet_rep{i}"
        loaded_models.append(tabular_model.load_from_checkpoint(dir=diri))
    return loaded_models
Пример #2
0
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    num_gaussian
):
    """Fitting a CategoryEmbeddingMDN model on a classification task must raise AssertionError."""
    train, test, target = classification_data
    # Nothing to test when no feature columns were parametrized.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    mdn_config = MixtureDensityHeadConfig(num_gaussian=num_gaussian)
    model_config = CategoryEmbeddingMDNConfig(
        task="classification", mdn_config=mdn_config
    )
    trainer_config = TrainerConfig(
        max_epochs=3, checkpoints=None, early_stopping=None, gpus=0, fast_dev_run=True
    )
    optimizer_config = OptimizerConfig()

    # Building/fitting the MDN model with a classification task is expected to assert.
    with pytest.raises(AssertionError):
        tabular_model = TabularModel(
            data_config=data_config,
            model_config=model_config,
            optimizer_config=optimizer_config,
            trainer_config=trainer_config,
        )
        tabular_model.fit(train=train, test=test)
Пример #3
0
def test_feature_extractor(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
):
    """DeepFeatureExtractor on a fitted model must emit backbone feature columns."""
    (train, test, target) = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )
    model_config_params = dict(task="regression")
    model_config = model_config_class(**model_config_params)
    trainer_config = TrainerConfig(max_epochs=3,
                                   checkpoints=None,
                                   early_stopping=None,
                                   gpus=0,
                                   fast_dev_run=True)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
    )
    dt = DeepFeatureExtractor(tabular_model)
    enc_df = dt.fit_transform(test)
    # IMPROVED: assert the predicate directly with a generator expression
    # instead of materializing a filtered list and testing its truthiness.
    assert any("backbone" in col for col in enc_df.columns)
Пример #4
0
def test_embedding_transformer(regression_data):
    """Categorical embeddings from a trained TabTransformer are exposed via the transformer."""
    train, test, target = regression_data
    numeric_features = [
        "AveRooms",
        "AveBedrms",
        "Population",
        "AveOccup",
        "Latitude",
        "Longitude",
    ]
    data_config = DataConfig(
        target=target,
        continuous_cols=numeric_features,
        categorical_cols=["HouseAgeBin"],
    )
    model_config = TabTransformerConfig(
        task="regression",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
    )
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    transformer = CategoricalEmbeddingTransformer(tabular_model)
    train_transform = transformer.fit_transform(train)
    embed_cols = [c for c in train_transform.columns if "HouseAgeBin_embed_dim" in c]
    mapping = transformer._mapping["HouseAgeBin"]
    # The mapping holds one extra entry beyond the seen categories —
    # presumably a slot for unknown values; TODO confirm against the library.
    assert len(train["HouseAgeBin"].unique()) + 1 == len(mapping.keys())
    # Every embedding vector must have one component per generated column.
    assert all(vec.shape[0] == len(embed_cols) for vec in mapping.values())
Пример #5
0
def test_ssl(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    ssl_task,
    aug_task,
):
    """TabTransformer SSL training on classification data reports MSE on evaluate()."""
    train, test, target = classification_data
    # Skip when the parametrization supplies no feature columns at all.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = TabTransformerConfig(
        task="ssl",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
        ssl_task=ssl_task,
        aug_task=aug_task,
    )
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
Пример #6
0
    def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
        """Wrap a pytorch_tabular CategoryEmbedding model built from the pars dicts.

        When ``model_pars`` is None the wrapper is left empty (``self.model = None``).
        """
        self.model_pars = model_pars
        self.compute_pars = compute_pars
        self.data_pars = data_pars

        if model_pars is None:
            # Empty placeholder wrapper, no model constructed.
            self.model = None
            return

        ###############################################################
        cols = data_pars['cols_model_group_custom']
        # target should always be a list. Multi-targets are only supported for
        # regression; Multi-Task classification is not implemented.
        data_config = DataConfig(
            target=cols['coly'],
            continuous_cols=cols['colnum'],
            categorical_cols=cols['colcat'],
        )

        model_config = CategoryEmbeddingModelConfig(**model_pars['model_pars'])
        trainer_config = TrainerConfig(**compute_pars.get('compute_pars', {}))
        optimizer_config = OptimizerConfig()

        self.config_pars = dict(
            data_config=data_config,
            model_config=model_config,
            optimizer_config=optimizer_config,
            trainer_config=trainer_config,
        )

        self.model = TabularModel(**self.config_pars)
        self.guide = None
        self.pred_summary = None  # All MC summary

        if VERBOSE:
            log(self.guide, self.model)
Пример #7
0
def test_ssl(
    regression_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    deep_layers,
    batch_norm_continuous_input,
    attention_pooling,
    ssl_task,
    aug_task,
):
    """AutoInt SSL training on regression data reports MSE on evaluate()."""
    train, test, target = regression_data
    # Skip when the parametrization supplies no feature columns at all.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = AutoIntConfig(
        task="ssl",
        ssl_task=ssl_task,
        aug_task=aug_task,
        deep_layers=deep_layers,
        batch_norm_continuous_input=batch_norm_continuous_input,
        attention_pooling=attention_pooling,
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
def test_ssl(regression_data, continuous_cols, categorical_cols,
             continuous_feature_transform, normalize_continuous_features,
             target_range, ssl_task, aug_task):
    """FTTransformer SSL training on regression data, optionally with target_range."""
    train, test, target = regression_data
    # Skip when the parametrization supplies no feature columns at all.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    config_kwargs = dict(
        task="ssl",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
        ssl_task=ssl_task,
        aug_task=aug_task,
    )
    if target_range:
        # Restrict each target to its observed min/max in the training split.
        config_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = FTTransformerConfig(**config_kwargs)
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
Пример #9
0
    def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
        """Build a pytorch_tabular TabularModel; the config class is chosen by name.

        ``model_pars['model_class']`` (possibly "pkg::ClassName") selects a config
        from MODEL_DICT. MDN models get an extra MixtureDensityHeadConfig.
        """
        self.model_pars, self.compute_pars, self.data_pars = model_pars, compute_pars, data_pars

        if model_pars is None:
            self.model = None

        else:
            ###############################################################
            dm = data_pars['cols_model_type2']
            data_config = DataConfig(
                # target should always be a list. Multi-targets are only supported
                # for regression; Multi-Task classification is not implemented.
                target=dm['coly'],
                continuous_cols=dm['colcontinuous'],
                categorical_cols=dm['colsparse'],
            )

            class_name = model_pars.get(
                'model_class', "CategoryEmbeddingModelConfig").split("::")[-1]
            assert class_name in MODEL_DICT, "ModelConfig not available"

            # Pick the needed ModelConfig  ####################################
            model_class = MODEL_DICT[class_name]
            if class_name == "CategoryEmbeddingMDNConfig":  ### Mixture Density model
                ## Check https://github.com/manujosephv/pytorch_tabular/blob/main/tests/test_mdn.py#L99
                self.model_pars['model_pars'][
                    'mdn_config'] = MixtureDensityHeadConfig(
                        num_gaussian=self.model_pars['model_pars']
                        ['num_gaussian'])
                # num_gaussian is consumed by the head config and would
                # cause errors if passed to the ModelConfig itself.
                del self.model_pars['model_pars']['num_gaussian']

            else:
                # BUG FIX: the bare try/except around `del` silently swallowed
                # every error; pop(key, None) removes a key only when present.
                for key in ('num_gaussian', 'mdn_config'):
                    self.model_pars['model_pars'].pop(key, None)

            model_config = model_class(**model_pars['model_pars'])
            trainer_config = TrainerConfig(**compute_pars.get(
                'compute_pars', {}))  # For testing quickly, max_epochs=1
            optimizer_config = OptimizerConfig(
                **compute_pars.get('optimizer_pars', {}))

            self.config_pars = {
                'data_config': data_config,
                'model_config': model_config,
                'optimizer_config': optimizer_config,
                'trainer_config': trainer_config,
            }

            self.model = TabularModel(**self.config_pars)
            self.guide = None
            self.pred_summary = None  ### All MC summary

            log(self.guide, self.model)
Пример #10
0
def test_ssl(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    ssl_task,
    aug_task,
):
    """TabNet with an SSL task is expected to be rejected with AssertionError."""
    train, test, target = classification_data
    # Skip when the parametrization supplies no feature columns at all.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = TabNetModelConfig(
        task="ssl",
        ssl_task=ssl_task,
        aug_task=aug_task,
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    # Construction/fit must assert since TabNet does not support "ssl" here.
    with pytest.raises(AssertionError):
        tabular_model = TabularModel(
            data_config=data_config,
            model_config=model_config,
            optimizer_config=optimizer_config,
            trainer_config=trainer_config,
        )
        tabular_model.fit(train=train, test=test)
Пример #11
0
def test_save_load(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    tmpdir,
):
    """A saved-then-reloaded model must reproduce the original evaluation metric."""
    train, test, target = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )
    config_cls, config_params = model_config_class
    config_params["task"] = "regression"
    model_config = config_cls(**config_params)
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )

    result_1 = tabular_model.evaluate(test)
    sv_dir = tmpdir.mkdir("saved_model")
    tabular_model.save_model(str(sv_dir))
    reloaded = TabularModel.load_from_checkpoint(str(sv_dir))
    result_2 = reloaded.evaluate(test)
    # First configured metric must be identical before and after the round trip.
    key_orig = f"test_{tabular_model.model.hparams.metrics[0]}"
    key_new = f"test_{reloaded.model.hparams.metrics[0]}"
    assert result_1[0][key_orig] == result_2[0][key_new]
Пример #12
0
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    deep_layers,
    batch_norm_continuous_input,
    attention_pooling,
):
    """AutoInt regression: fit, check MSE is reported, and predict row-for-row."""
    train, test, target = regression_data
    # Skip when the parametrization supplies no feature columns at all.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    config_kwargs = dict(
        task="regression",
        deep_layers=deep_layers,
        batch_norm_continuous_input=batch_norm_continuous_input,
        attention_pooling=attention_pooling,
    )
    if target_range:
        # Restrict each target to its observed min/max in the training split.
        config_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = AutoIntConfig(**config_kwargs)
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
Пример #13
0
def test_date_encoding(timeseries_data, freq):
    """Date-column encoding adds the expected derived column per frequency.

    Frequencies "H"/"D"/"T" yield _Hour/_Dayofyear/_Minute columns;
    "S" (seconds) must raise RuntimeError during datamodule setup.
    """
    (train, test, target) = timeseries_data
    train, valid = train_test_split(train, random_state=42)
    data_config = DataConfig(
        target=target + ["Occupancy"],
        continuous_cols=[
            "Temperature", "Humidity", "Light", "CO2", "HumidityRatio"
        ],
        categorical_cols=[],
        date_columns=[("date", freq)],
        encode_date_columns=True,
    )
    model_config_params = dict(task="regression")
    model_config = CategoryEmbeddingModelConfig(**model_config_params)
    trainer_config = TrainerConfig(max_epochs=1,
                                   checkpoints=None,
                                   early_stopping=None)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    config = tabular_model.config
    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=config,
        test=test,
    )
    datamodule.prepare_data()
    if freq != "S":
        datamodule.setup("fit")
        config = datamodule.config
        if freq == "H":
            assert "_Hour" in datamodule.train.columns
        elif freq == "D":
            assert "_Dayofyear" in datamodule.train.columns
        elif freq == "T":
            assert "_Minute" in datamodule.train.columns
    else:
        # IMPROVED: pytest.raises replaces the manual try/assert False/except
        # pattern and matches the error-assertion style used elsewhere in this file.
        with pytest.raises(RuntimeError):
            datamodule.setup("fit")
Пример #14
0
def test_regression(
    regression_data,
    multi_target,
    embed_categorical,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """NODE regression: fit, check MSE is reported, and predict row-for-row."""
    train, test, target = regression_data
    # Skip when the parametrization supplies no feature columns at all.
    if not continuous_cols and not categorical_cols:
        assert True
        return

    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    config_kwargs = dict(
        task="regression",
        depth=2,
        num_trees=50,
        embed_categorical=embed_categorical,
    )
    if target_range:
        # Restrict each target to its observed min/max in the training split.
        config_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = NodeConfig(**config_kwargs)
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
Пример #15
0
def load_model(path=""):
    """Load a pickled Model wrapper plus its TabularModel checkpoint from *path*.

    Args:
        path: directory containing ``model/model.pkl`` and ``model/torch_checkpoint``.

    Returns:
        tuple: ``(model, session)`` where ``session`` is always None.
    """
    global model, session
    import cloudpickle as pickle

    # BUG FIX: use a context manager so the file handle is always closed.
    # SECURITY NOTE: pickle.load executes arbitrary code — only load trusted files.
    with open(path + '/model/model.pkl', mode='rb') as fh:
        model0 = pickle.load(fh)

    model = Model()  # Empty model
    model.model_pars   = model0.model_pars
    model.compute_pars = model0.compute_pars
    model.data_pars    = model0.data_pars

    ### Custom part: restore the underlying pytorch_tabular model from its checkpoint.
    model.model = TabularModel.load_from_checkpoint(path + "/model/torch_checkpoint")

    session = None
    return model, session
Пример #16
0
    def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
        """Wrap a pytorch_tabular model; the config class is resolved by name.

        ``model_pars['model_class']`` (possibly "pkg::ClassName") chooses the
        ModelConfig; unknown names fall back to NodeConfig.
        """
        self.model_pars = model_pars
        self.compute_pars = compute_pars
        self.data_pars = data_pars

        if model_pars is None:
            # Empty placeholder wrapper, no model constructed.
            self.model = None
            return

        ###############################################################
        dm = data_pars['cols_model_group_custom']
        # target should always be a list. Multi-targets are only supported for
        # regression; Multi-Task classification is not implemented.
        data_config = DataConfig(
            target=dm['coly'],
            continuous_cols=dm['colnum'],
            categorical_cols=dm['colcat'],
        )

        class_name = model_pars.get(
            'model_class', "CategoryEmbeddingModelConfig").split("::")[-1]
        assert class_name in MODEL_DICT, "ModelConfig not available"
        log2(class_name)

        # Pick the needed ModelConfig; any other recognised name maps to NodeConfig.
        model_class = {
            "CategoryEmbeddingModelConfig": CategoryEmbeddingModelConfig,
            "TabNetModelConfig": TabNetModelConfig,
        }.get(class_name, NodeConfig)

        model_config = model_class(**self.model_pars.get('model_pars', {}))
        # For testing quickly, pass max_epochs=1 through compute_pars.
        trainer_config = TrainerConfig(**compute_pars.get('compute_pars', {}))
        optimizer_config = OptimizerConfig(**compute_pars.get('optimizer_pars', {}))

        self.config_pars = {
            'data_config': data_config,
            'model_config': model_config,
            'optimizer_config': optimizer_config,
            'trainer_config': trainer_config,
        }

        self.model = TabularModel(**self.config_pars)
        self.guide = None
        self.pred_summary = None  ### All MC summary

        if VERBOSE: log(self.model)
Пример #17
0
def test2(nrows=10000):
    """
       python source/models/torch_tabular.py test

    """
    global model, session

    # NOTE(review): the nrows parameter is unused — the dataset size is fixed
    # at 1000 here; confirm whether nrows should be passed through.
    df, colcat, colnum, coly = test_dataset_covtype(1000)
    target_name = coly

    df.head()
    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)
    num_classes = len(set(train[target_name].values.ravel()))

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # e.g. "quantile_normal"
        normalize_continuous_features=False,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        metrics_params=[{"num_classes": num_classes}, {}],
    )

    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    experiment_config = ExperimentConfig(
        project_name="PyTorch Tabular Example",
        run_name="node_forest_cov",
        exp_watch="gradients",
        log_target="wandb",
        log_logits=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        # experiment_config=experiment_config,
    )

    tabular_model.fit(train=train, validation=val)
    result = tabular_model.evaluate(val)
    log(result)

    test.drop(columns=target_name, inplace=True)
    # NOTE(review): predictions are taken from `val`, not the target-stripped
    # `test` frame prepared above — confirm which split was intended.
    pred_df = tabular_model.predict(val.iloc[:100, :])

    log(pred_df)
Пример #18
0
def test_save_for_inference(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    save_type,
    tmpdir,
):
    """save_model_for_inference writes a .pt (pytorch) or .onnx artifact to disk.

    BUG FIX: the original compared the *builtin* ``type`` against "pytorch"
    (always False), so the ``model.pt`` branch was unreachable; the comparison
    now uses the ``save_type`` parameter, computed once for both the save call
    and the existence assertion.
    """
    (train, test, target) = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )
    model_config_class, model_config_params = model_config_class
    model_config_params["task"] = "regression"
    model_config = model_config_class(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )
    sv_dir = tmpdir.mkdir("saved_model")

    model_path = sv_dir / ("model.pt" if save_type == "pytorch" else "model.onnx")
    tabular_model.save_model_for_inference(model_path, kind=save_type)
    assert os.path.exists(model_path)
def main():
    """End-to-end demo: synthesize data, train a CategoryEmbedding classifier, predict, save."""
    # Generate synthetic mixed-type classification data.
    data, cat_col_names, num_col_names = make_mixed_classification(
        n_samples=10000, n_features=20, n_categories=4)
    train, test = train_test_split(data, random_state=42)
    train, val = train_test_split(train, random_state=42)

    # ##########Define the Configs############
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=1024,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        layers="1024-512-512",
        activation="LeakyReLU",
        learning_rate=1e-3,
    )

    tabular_mode = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )

    # Training the model.
    tabular_mode.fit(train=train, validation=val)
    # Loss and metrics on held-out data.
    result = tabular_mode.evaluate(test)

    # New predictions as a DataFrame.
    pred_df = tabular_mode.predict(test)
    pred_df.head()

    print_metrics(test['target'], pred_df["prediction"], tag="Holdout")

    # Persist the trained model.
    tabular_mode.save_model("Analysis/basic")
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """End-to-end smoke test: fit a CategoryEmbedding regressor and verify
    that evaluation reports a validation loss and that predictions align
    row-for-row with the test frame."""
    train, test, target = regression_data
    # Degenerate parametrization: no feature columns at all — nothing to fit.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config_params = {"task": "regression"}
    if target_range:
        # Constrain each output head to the observed min/max of its target.
        model_config_params["target_range"] = [
            (train[col].min().item(), train[col].max().item())
            for col in data_config.target
        ]
    model_config = CategoryEmbeddingModelConfig(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=3, checkpoints=None, early_stopping=None, gpus=0)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "valid_loss" in result[0]
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
Example #21
0
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    variant,
    num_gaussian,
):
    """Smoke-test a mixture-density-head regressor (``variant`` model class)
    with ``num_gaussian`` components: fit briefly and check MSE is reported
    and predictions cover every test row."""
    train, test, target = regression_data
    # Degenerate parametrization: no feature columns — nothing to fit.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    # Attach a mixture-density head to the chosen model variant.
    model_config = variant(
        task="regression",
        mdn_config=MixtureDensityHeadConfig(num_gaussian=num_gaussian),
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0]
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
Example #22
0
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    deep_layers,
    batch_norm_continuous_input,
):
    """Smoke-test AutoInt classification: fit briefly and check accuracy is
    reported and predictions cover every test row."""
    train, test, target = classification_data
    # Degenerate parametrization: no feature columns — nothing to fit.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = AutoIntConfig(
        task="classification",
        deep_layers=deep_layers,
        batch_norm_continuous_input=batch_norm_continuous_input,
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_accuracy" in result[0]
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
Example #23
0
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    embed_categorical,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Smoke-test NODE classification (small forest, optional categorical
    embeddings): fit briefly, check accuracy is reported and predictions
    cover every test row."""
    train, test, target = classification_data
    # Degenerate parametrization: no feature columns — nothing to fit.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    # Keep the forest tiny so the test stays fast.
    model_config = NodeConfig(
        task="classification",
        depth=2,
        num_trees=50,
        embed_categorical=embed_categorical,
    )
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_accuracy" in result[0]
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Smoke-test CategoryEmbedding classification: fit briefly, check a
    validation loss is reported and predictions cover every test row."""
    train, test, target = classification_data
    # Degenerate parametrization: no feature columns — nothing to fit.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config = CategoryEmbeddingModelConfig(task="classification")
    trainer_config = TrainerConfig(
        max_epochs=3, checkpoints=None, early_stopping=None, gpus=0)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "valid_loss" in result[0]
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def main_64():
    """10-fold TabNet (n_d = n_a = 64) cross-validation training loop.

    For each StratifiedKFold split: fit on the fold's train portion with a
    class-weighted cross-entropy loss, collect out-of-fold predictions for
    holdout metrics, save the fold model, and predict the submission test
    set.  Per-fold class-0 probabilities are averaged and thresholded into
    ``Analysis/submission_2.csv``.
    """
    # NOTE(review): `train` is rebound inside the CV loop; the global
    # declaration exposes the last fold's split at module level — confirm
    # something downstream actually relies on this.
    global train
    data, test_data, cat_col_names, num_col_names = data_load()
    bsize = 2500 * 3 * 2 * 2  # batch size

    # ##########Define the Configs############
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()

    model_config = TabNetModelConfig(
        task="classification",
        learning_rate=1e-3 * bsize / 1024,  # scale LR linearly with batch size
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.3,
    )

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    res_pred = []  # out-of-fold prediction frames, one per fold
    res_test = []  # per-fold predictions on the submission test set
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        tabular_mode = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config,
        )
        # Re-weight classes to counter label imbalance in this fold.
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1)

        # Training the Model
        tabular_mode.fit(train=train, validation=val, max_epochs=100, loss=weighted_loss)
        pred_df = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(pred_df)
        tabular_mode.save_model(f"Analysis/basic_tabnet_rep{i}")

        pred = tabular_mode.predict(test_data)
        res_test.append(pred)

    # Out-of-fold predictions, restored to the original row order.
    pred_tot = pd.concat(res_pred).sort_index()

    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    # Average the folds' P(class 0) and threshold into hard labels.
    # BUGFIX: "0_probability" is the probability of class 0, so a mean above
    # 0.5 must map to label 0.  The previous `1 if x > 0.5 else 0` was
    # inverted; this now matches the sibling NODE pipeline in this file.
    pred_df = pd.concat([res_testi.loc[:, ["0_probability"]] for res_testi in res_test], axis=1).apply(np.mean, axis=1)
    pred_df2 = pred_df.map(lambda x: 0 if x > 0.5 else 1)

    sample_submission = pd.read_csv("Data/sample_submission.csv")
    sample_submission["target"] = pred_df2.values

    sample_submission.to_csv("Analysis/submission_2.csv", index=False)

    print(confusion_matrix(data['target'], pred_tot["prediction"]))
def test_ssl(
    regression_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    target_transform,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    ssl_task,
    aug_task,
):
    """Smoke-test the self-supervised ("ssl") task with custom loss, metrics,
    optimizer and target transform supplied through ``fit``."""
    train, test, target = regression_data
    # Degenerate parametrization: no feature columns — nothing to fit.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_config_params = {
        "task": "ssl",
        "ssl_task": ssl_task,
        "aug_task": aug_task,
    }
    if target_range:
        # Bound every output to its observed range in the training split.
        model_config_params["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]
    model_config = CategoryEmbeddingModelConfig(**model_config_params)
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        target_transform=target_transform,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )

    result = tabular_model.evaluate(test)
    # Default metrics report MSE; a custom metric reports under its own name.
    expected_key = ("test_mean_squared_error"
                    if custom_metrics is None else "test_fake_metric")
    assert expected_key in result[0]
Example #27
0
def main():
    """10-fold NODE cross-validation training loop.

    For each StratifiedKFold split: fit a NODE classifier with a
    class-weighted cross-entropy loss, log the fold AUC, collect
    out-of-fold predictions for holdout metrics, and predict the
    submission test set in memory-bounded chunks.  Per-fold class-0
    probabilities are averaged and thresholded into
    ``Analysis/submission_2_node.csv``.
    """
    data, test_data, cat_col_names, num_col_names = data_load()

    bsize = 2500 * 2  # batch size

    # ##########Define the Configs############
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()

    model_config = NodeConfig(
        task="classification",
        num_layers=2,  # Number of Dense Layers
        num_trees=1024,  # Number of Trees in each layer
        depth=3,  # Depth of each Tree
        # If True, use a learned embedding; else LeaveOneOutEncoding
        # for categorical columns.
        embed_categorical=True,
        learning_rate=1e-3,
        additional_tree_output_dim=5,
    )

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    res_pred = []  # out-of-fold prediction frames, one per fold
    res_test = []  # per-fold predictions on the submission test set
    for fold, (train_idx, test_idx) in enumerate(
            cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        tabular_mode = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config,
        )
        # Re-weight classes to counter label imbalance in this fold.
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1)

        # Training the Model
        tabular_mode.fit(
            train=train, validation=val, max_epochs=100, loss=weighted_loss)
        pred_df = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(pred_df)

        print(
            f"Fold {fold} AUC score: "
            f"{roc_auc_score(test.target.values, pred_df.prediction.values)}"
        )

        # Predict the submission set in fixed-size chunks to bound memory.
        # BUGFIX: the previous `int(n / ns)` loop silently dropped the final
        # partial chunk when the row count was not a multiple of `ns`
        # (also shadowing the fold index `i`); slicing by start offset
        # covers every row exactly once.
        ns = 20000
        chunks = []
        for start in range(0, test_data.shape[0], ns):
            chunks.append(tabular_mode.predict(test_data.iloc[start:start + ns]))
        res_test.append(pd.concat(chunks))

    # Average the folds' P(class 0) and threshold into hard labels
    # (P(class 0) > 0.5 -> label 0).
    pred_df = pd.concat(
        [res_testi.loc[:, ["0_probability"]] for res_testi in res_test],
        axis=1).apply(np.mean, axis=1)
    pred_df2 = pred_df.map(lambda x: 0 if x > 0.5 else 1)

    sample_submisson = pd.read_csv("Data/sample_submission.csv")
    sample_submisson["target"] = pred_df2.values

    # Out-of-fold predictions, restored to the original row order.
    pred_tot = pd.concat(res_pred).sort_index()

    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    sample_submisson.to_csv("Analysis/submission_2_node.csv", index=False)

    print(confusion_matrix(data['target'], pred_tot["prediction"]))
Example #28
0
def test_dataloader(
    regression_data,
    validation_split,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_transform,
    embedding_dims,
):
    """Check TabularDatamodule wiring: inferred categorical cardinalities and
    embedding dims, continuous-feature normalization, and agreement between
    the fit-time validation loader and a fresh inference loader built over
    the same frame."""
    train, test, target = regression_data
    train, valid = train_test_split(train, random_state=42)
    # Degenerate parametrization: no feature columns — nothing to check.
    if not continuous_cols and not categorical_cols:
        assert True
        return
    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
        validation_split=validation_split,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="regression", embedding_dims=embedding_dims)
    trainer_config = TrainerConfig(
        max_epochs=1, checkpoints=None, early_stopping=None)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=tabular_model.config,
        test=test,
        target_transform=target_transform,
    )
    datamodule.prepare_data()
    datamodule.setup("fit")
    config = datamodule.config
    if len(categorical_cols) > 0:
        assert config.categorical_cardinality[0] == 5
        # Default inferred embedding size is 3; an explicit spec wins.
        expected_dim = (3 if embedding_dims is None
                        else embedding_dims[0][-1])
        assert config.embedding_dims[0][-1] == expected_dim
    if normalize_continuous_features and len(continuous_cols) > 0:
        # After normalization the training split should be ~N(0, 1).
        first_cont = config.continuous_cols[0]
        assert round(datamodule.train[first_cont].mean()) == 0
        assert round(datamodule.train[first_cont].std()) == 1
    # Both loaders over the same frame must emit identical continuous batches.
    batch_fit = next(iter(datamodule.val_dataloader()))["continuous"]
    batch_inf = next(iter(datamodule.prepare_inference_dataloader(valid)))["continuous"]
    assert np.not_equal(batch_fit, batch_inf).sum().item() == 0
Example #29
0
        'target'
    ],  #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# --- Example script: configure, train, evaluate, save, and reload a model.
# Relies on `data_config`, `train`, `val`, and `test` defined earlier in the
# original snippet (the data-config portion is truncated above).
trainer_config = TrainerConfig(
    auto_lr_find=
    True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=1,  # index of the GPU to use; 0 means CPU (per the original note)
)
optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between each layers
    learning_rate=1e-3)

# Assemble the model from the four config objects.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)  # train with explicit validation split
result = tabular_model.evaluate(test)  # loss/metrics on the holdout frame
pred_df = tabular_model.predict(test)  # per-row predictions as a DataFrame
tabular_model.save_model("Analysis/basic")  # persist model to disk
# Round-trip check: reload the saved model from the same path.
loaded_model = TabularModel.load_from_checkpoint("Analysis/basic")
Example #30
0
def test2(nrows=10000):
    """Smoke-test PyTorch Tabular on the UCI Covertype dataset.

    Downloads ``covtype.data.gz`` on first run, fits a CategoryEmbedding
    classifier in fast-dev-run mode, logs evaluation metrics, and logs
    predictions for a small validation slice.

    Usage::

       python source/models/torch_tabular.py test

    Args:
        nrows: number of CSV rows to load from the dataset.
    """
    global model, session

    # Fetch the dataset into ~/data/input/covtype/ if not already present.
    BASE_DIR = Path.home().joinpath('data/input/covtype/')
    datafile = BASE_DIR.joinpath('covtype.data.gz')
    datafile.parent.mkdir(parents=True, exist_ok=True)
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
    if not datafile.exists():
        wget.download(url, datafile.as_posix())

    # Column layout of the covtype CSV (it ships without a header row).
    target_name = ["Covertype"]
    colcat = [
        "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3",
        "Wilderness_Area4", "Soil_Type1", "Soil_Type2", "Soil_Type3",
        "Soil_Type4", "Soil_Type5", "Soil_Type6", "Soil_Type7", "Soil_Type8",
        "Soil_Type9", "Soil_Type10", "Soil_Type11", "Soil_Type12",
        "Soil_Type13", "Soil_Type14", "Soil_Type15", "Soil_Type16",
        "Soil_Type17", "Soil_Type18", "Soil_Type19", "Soil_Type20",
        "Soil_Type21", "Soil_Type22", "Soil_Type23", "Soil_Type24",
        "Soil_Type25", "Soil_Type26", "Soil_Type27", "Soil_Type28",
        "Soil_Type29", "Soil_Type30", "Soil_Type31", "Soil_Type32",
        "Soil_Type33", "Soil_Type34", "Soil_Type35", "Soil_Type36",
        "Soil_Type37", "Soil_Type38", "Soil_Type39", "Soil_Type40"
    ]
    colnum = [
        "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points"
    ]

    feature_columns = (colnum + colcat + target_name)

    df = pd.read_csv(datafile, header=None, names=feature_columns, nrows=nrows)

    df.head()
    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)
    # Distinct class labels present in the training split.
    num_classes = len(set(train[target_name].values.ravel()))

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  #"quantile_normal",
        normalize_continuous_features=False)
    # f1 needs num_classes; accuracy takes no extra params (hence the {}).
    model_config = CategoryEmbeddingModelConfig(task="classification",
                                                metrics=["f1", "accuracy"],
                                                metrics_params=[{
                                                    "num_classes":
                                                    num_classes
                                                }, {}])

    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    # Built but deliberately NOT passed to TabularModel below (wandb logging
    # is disabled for this smoke test); kept for easy re-enabling.
    experiment_config = ExperimentConfig(
        project_name="PyTorch Tabular Example",
        run_name="node_forest_cov",
        exp_watch="gradients",
        log_target="wandb",
        log_logits=True)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        # experiment_config=experiment_config,
    )

    tabular_model.fit(train=train, validation=val)
    result = tabular_model.evaluate(val)
    log(result)

    # NOTE(review): labels are dropped from `test`, but predictions below are
    # taken from `val`, not `test` — confirm which frame was intended.
    test.drop(columns=target_name, inplace=True)
    pred_df = tabular_model.predict(val.iloc[:100, :])

    log(pred_df)