def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
    """Store the parameter dicts and, when a model spec is given, build the TabularModel.

    Args:
        model_pars: dict whose ``'model_pars'`` entry holds the keyword
            arguments for ``CategoryEmbeddingModelConfig``. ``None`` creates
            an empty wrapper whose ``self.model`` is ``None``.
        data_pars: dict whose ``'cols_model_group_custom'`` entry maps
            ``'coly'`` / ``'colnum'`` / ``'colcat'`` to the target,
            continuous and categorical column lists. Read only when
            ``model_pars`` is given.
        compute_pars: optional dict whose ``'compute_pars'`` entry holds the
            keyword arguments for ``TrainerConfig``. A missing entry or a
            ``None`` dict falls back to the ``TrainerConfig`` defaults.
    """
    self.model_pars, self.compute_pars, self.data_pars = model_pars, compute_pars, data_pars

    # Defined on every path so an empty wrapper can be inspected without
    # hitting AttributeError.
    self.model = None
    self.guide = None
    self.pred_summary = None  # All MC summary

    if model_pars is None:
        return

    ###############################################################
    dm = data_pars['cols_model_group_custom']

    data_config = DataConfig(
        # target should always be a list. Multi-targets are only supported
        # for regression. Multi-Task Classification is not implemented.
        target=dm['coly'],
        continuous_cols=dm['colnum'],
        categorical_cols=dm['colcat'],
    )

    model_config = CategoryEmbeddingModelConfig(**model_pars['model_pars'])

    # compute_pars defaults to None in the signature, so guard before .get().
    trainer_config = TrainerConfig(**(compute_pars or {}).get('compute_pars', {}))

    optimizer_config = OptimizerConfig()

    self.config_pars = {
        'data_config': data_config,
        'model_config': model_config,
        'optimizer_config': optimizer_config,
        'trainer_config': trainer_config,
    }
    self.model = TabularModel(**self.config_pars)

    if VERBOSE:
        log(self.guide, self.model)
def test2(nrows=10000):
    """Smoke-test a CategoryEmbedding classifier on the covtype sample.

    Usage:
        python source/models/torch_tabular.py test

    Args:
        nrows: number of rows requested from ``test_dataset_covtype``.
    """
    # Honour the caller's row budget (the value used to be hard-coded).
    df, colcat, colnum, coly = test_dataset_covtype(nrows)
    target_name = coly

    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)
    num_classes = len(set(train[target_name].values.ravel()))

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # "quantile_normal",
        normalize_continuous_features=False,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        metrics_params=[{"num_classes": num_classes}, {}],
    )
    # fast_dev_run keeps this a quick wiring check rather than real training.
    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    optimizer_config = OptimizerConfig()

    # Experiment tracking is intentionally not wired in. To enable it, build an
    # ExperimentConfig(project_name="PyTorch Tabular Example",
    # run_name="node_forest_cov", exp_watch="gradients", log_target="wandb",
    # log_logits=True) and pass it as experiment_config=... below.
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, validation=val)

    result = tabular_model.evaluate(val)
    log(result)

    # Predict on the held-out split with the target removed. drop() returns a
    # new frame, so the frame produced by train_test_split is not mutated.
    test_features = test.drop(columns=target_name)
    pred_df = tabular_model.predict(test_features.iloc[:100, :])
    log(pred_df)
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """Fit a regression model and check the evaluation keys and prediction size.

    ``regression_data`` is unpacked as ``(train, test, target)``. With
    ``multi_target`` set, ``"MedInc"`` is appended to the target list. With
    ``target_range`` set, each target's (min, max) on ``train`` is passed to
    the model config.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        # No input features at all: nothing to fit, the case passes trivially.
        return

    targets = target + ["MedInc"] if multi_target else target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {"task": "regression"}
    if target_range:
        # One (min, max) pair per configured target, as plain Python scalars.
        model_kwargs["target_range"] = [
            (train[col].min().item(), train[col].max().item())
            for col in data_config.target
        ]

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=CategoryEmbeddingModelConfig(**model_kwargs),
        optimizer_config=OptimizerConfig(),
        trainer_config=TrainerConfig(
            max_epochs=3,
            checkpoints=None,
            early_stopping=None,
            gpus=0,
        ),
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "valid_loss" in result[0].keys()

    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_date_encoding(timeseries_data, freq):
    """Check which date-derived column appears for each ``freq`` code.

    For ``"H"``, ``"D"`` and ``"T"`` the corresponding column must exist in
    ``datamodule.train`` after setup. For ``"S"``, ``setup("fit")`` must
    raise ``RuntimeError``. Any other code only has to get through setup.
    """
    train, test, target = timeseries_data
    train, valid = train_test_split(train, random_state=42)

    data_config = DataConfig(
        target=target + ["Occupancy"],
        continuous_cols=["Temperature", "Humidity", "Light", "CO2", "HumidityRatio"],
        categorical_cols=[],
        date_columns=[("date", freq)],
        encode_date_columns=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=CategoryEmbeddingModelConfig(task="regression"),
        optimizer_config=OptimizerConfig(),
        trainer_config=TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None),
    )

    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=tabular_model.config,
        test=test,
    )
    datamodule.prepare_data()

    if freq == "S":
        # This frequency is expected to be rejected during setup.
        try:
            datamodule.setup("fit")
        except RuntimeError:
            return
        assert False

    datamodule.setup("fit")
    config = datamodule.config  # re-read after setup; not inspected further here

    expected_column = {"H": "_Hour", "D": "_Dayofyear", "T": "_Minute"}.get(freq)
    if expected_column is not None:
        assert expected_column in datamodule.train.columns
def test_embedding_transformer(regression_data):
    """Fit a model, then check the learned ``HouseAgeBin`` embedding mapping.

    The mapping must hold one more entry than there are distinct
    ``HouseAgeBin`` values in ``train``, and every embedding vector must be
    as long as the number of ``HouseAgeBin_embed_dim`` columns produced by
    the transformer.
    """
    train, test, target = regression_data

    data_config = DataConfig(
        target=target,
        continuous_cols=[
            "AveRooms",
            "AveBedrms",
            "Population",
            "AveOccup",
            "Latitude",
            "Longitude",
        ],
        categorical_cols=["HouseAgeBin"],
    )
    trainer_config = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=CategoryEmbeddingModelConfig(task="regression"),
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    transformer = CategoricalEmbeddingTransformer(tabular_model)
    transformed = transformer.fit_transform(train)

    n_embed_cols = sum(
        1 for name in transformed.columns if "HouseAgeBin_embed_dim" in name
    )
    age_mapping = transformer._mapping["HouseAgeBin"]

    n_categories = len(train["HouseAgeBin"].unique())
    assert n_categories + 1 == len(age_mapping.keys())

    for vector in age_mapping.values():
        assert vector.shape[0] == n_embed_cols
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Fit a classification model and check the evaluation key and prediction size.

    ``classification_data`` is unpacked as ``(train, test, target)``. When
    both column lists are empty there is nothing to fit and the case passes
    trivially.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=CategoryEmbeddingModelConfig(task="classification"),
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, test=test)

    metrics = tabular_model.evaluate(test)
    assert "test_accuracy" in metrics[0].keys()

    predictions = tabular_model.predict(test)
    assert predictions.shape[0] == test.shape[0]
def main():
    """Train, evaluate and save a classifier on synthetic mixed-type data.

    Data is split into train / validation / test, the model is fitted on
    train with validation, evaluated and scored on test, then saved to
    ``Analysis/basic``.
    """
    # --- synthetic data ----------------------------------------------------
    data, cat_col_names, num_col_names = make_mixed_classification(
        n_samples=10000,
        n_features=20,
        n_categories=4,
    )
    train, test = train_test_split(data, random_state=42)
    train, val = train_test_split(train, random_state=42)

    # --- configs -----------------------------------------------------------
    model = TabularModel(
        data_config=DataConfig(
            target=["target"],
            continuous_cols=num_col_names,
            categorical_cols=cat_col_names,
        ),
        model_config=CategoryEmbeddingModelConfig(
            task="classification",
            layers="1024-512-512",
            activation="LeakyReLU",
            learning_rate=1e-3,
        ),
        optimizer_config=OptimizerConfig(),
        trainer_config=TrainerConfig(
            auto_lr_find=True,
            batch_size=1024,
            max_epochs=100,
            gpus=1,
        ),
    )

    # --- training ----------------------------------------------------------
    model.fit(train=train, validation=val)

    # --- evaluation: loss and metrics on new data --------------------------
    model.evaluate(test)

    # --- new predictions as a DataFrame ------------------------------------
    holdout_pred = model.predict(test)
    holdout_pred.head()  # value is discarded; only useful as a notebook peek
    print_metrics(test['target'], holdout_pred["prediction"], tag="Holdout")

    # --- persist -----------------------------------------------------------
    model.save_model("Analysis/basic")
def test_ssl(
    regression_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    target_transform,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    ssl_task,
    aug_task,
):
    """Fit a model with ``task="ssl"`` and check which metric key is reported.

    Without ``custom_metrics`` the evaluation result must contain
    ``"test_mean_squared_error"``; with them it must contain
    ``"test_fake_metric"``. When both column lists are empty the case passes
    trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return

    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {"task": "ssl", "ssl_task": ssl_task, "aug_task": aug_task}
    if target_range:
        # One (min, max) pair per configured target, as Python floats.
        model_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_config.target
        ]

    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=CategoryEmbeddingModelConfig(**model_kwargs),
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        target_transform=target_transform,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )

    result = tabular_model.evaluate(test)
    if custom_metrics is None:
        expected_key = "test_mean_squared_error"
    else:
        expected_key = "test_fake_metric"
    assert expected_key in result[0].keys()
def test_dataloader(
    regression_data,
    validation_split,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_transform,
    embedding_dims,
):
    """Check the config and loaders a ``TabularDatamodule`` produces after setup.

    Checks, in order: cardinality and embedding size of the first
    categorical column (when categorical columns are given), mean / std of
    the first continuous column (when normalisation is requested), and that
    the first ``"continuous"`` batch of the validation loader equals the one
    from an inference loader built on the same validation frame.
    """
    train, test, target = regression_data
    train, valid = train_test_split(train, random_state=42)

    if len(continuous_cols) + len(categorical_cols) == 0:
        # No input features at all: the case passes trivially.
        return

    data_config = DataConfig(
        target=target + ["MedInc"] if multi_target else target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
        validation_split=validation_split,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="regression",
        embedding_dims=embedding_dims,
    )
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None),
    )

    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=tabular_model.config,
        test=test,
        target_transform=target_transform,
    )
    datamodule.prepare_data()
    datamodule.setup("fit")
    config = datamodule.config

    if len(categorical_cols) > 0:
        assert config.categorical_cardinality[0] == 5
        actual_dim = config.embedding_dims[0][-1]
        if embedding_dims is None:
            assert actual_dim == 3
        else:
            assert actual_dim == embedding_dims[0][-1]

    if normalize_continuous_features and len(continuous_cols) > 0:
        first_continuous = config.continuous_cols[0]
        assert round(datamodule.train[first_continuous].mean()) == 0
        assert round(datamodule.train[first_continuous].std()) == 1
        # assert round(datamodule.validation[config.continuous_cols[0]].mean()) == 0
        # assert round(datamodule.validation[config.continuous_cols[0]].std()) == 1

    fit_loader = datamodule.val_dataloader()
    inference_loader = datamodule.prepare_inference_dataloader(valid)

    fit_batch = next(iter(fit_loader))["continuous"]
    inference_batch = next(iter(inference_loader))["continuous"]
    mismatches = np.not_equal(fit_batch, inference_batch).sum().item()
    assert mismatches == 0
'target' ], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented continuous_cols=num_col_names, categorical_cols=cat_col_names, ) trainer_config = TrainerConfig( auto_lr_find= True, # Runs the LRFinder to automatically derive a learning rate batch_size=1024, max_epochs=100, gpus=1, #index of the GPU to use. 0, means CPU ) optimizer_config = OptimizerConfig() model_config = CategoryEmbeddingModelConfig( task="classification", layers="1024-512-512", # Number of nodes in each layer activation="LeakyReLU", # Activation between each layers learning_rate=1e-3) tabular_model = TabularModel( data_config=data_config, model_config=model_config, optimizer_config=optimizer_config, trainer_config=trainer_config, ) tabular_model.fit(train=train, validation=val) result = tabular_model.evaluate(test) pred_df = tabular_model.predict(test) tabular_model.save_model("Analysis/basic") loaded_model = TabularModel.load_from_checkpoint("Analysis/basic")
def test2(nrows=10000):
    """Smoke-test a CategoryEmbedding classifier on the UCI covtype dataset.

    Usage:
        python source/models/torch_tabular.py test

    The archive is downloaded to ``~/data/input/covtype/covtype.data.gz`` the
    first time and reused afterwards.

    Args:
        nrows: number of rows read from the archive.
    """
    BASE_DIR = Path.home().joinpath('data/input/covtype/')
    datafile = BASE_DIR.joinpath('covtype.data.gz')
    datafile.parent.mkdir(parents=True, exist_ok=True)
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
    if not datafile.exists():
        wget.download(url, datafile.as_posix())

    target_name = ["Covertype"]
    # Wilderness_Area1..4 followed by Soil_Type1..40, in that order.
    colcat = (
        [f"Wilderness_Area{i}" for i in range(1, 5)]
        + [f"Soil_Type{i}" for i in range(1, 41)]
    )
    colnum = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    # The file has no header row; names are assigned positionally.
    feature_columns = colnum + colcat + target_name
    df = pd.read_csv(datafile, header=None, names=feature_columns, nrows=nrows)

    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)
    num_classes = len(set(train[target_name].values.ravel()))

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # "quantile_normal",
        normalize_continuous_features=False,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        metrics_params=[{"num_classes": num_classes}, {}],
    )
    # fast_dev_run keeps this a quick wiring check rather than real training.
    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    optimizer_config = OptimizerConfig()

    # Experiment tracking is intentionally not wired in. To enable it, build an
    # ExperimentConfig(project_name="PyTorch Tabular Example",
    # run_name="node_forest_cov", exp_watch="gradients", log_target="wandb",
    # log_logits=True) and pass it as experiment_config=... below.
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(train=train, validation=val)

    result = tabular_model.evaluate(val)
    log(result)

    # Predict on the held-out split with the target removed. drop() returns a
    # new frame, so the frame produced by train_test_split is not mutated.
    test_features = test.drop(columns=target_name)
    pred_df = tabular_model.predict(test_features.iloc[:100, :])
    log(pred_df)
categorical_cols=cat_col_names, continuous_feature_transform="quantile_normal", normalize_continuous_features=True ) trainer_config = TrainerConfig( auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate batch_size=1024, max_epochs=10, gpus=1, #index of the GPU to use. 0, means CPU fast_dev_run=True ) optimizer_config = OptimizerConfig() model_config = CategoryEmbeddingModelConfig( task="classification", layers="4096-4096-512", # Number of nodes in each layer activation="LeakyReLU", # Activation between each layers learning_rate = 1e-3, metrics=["accuracy"] ) tabular_model = TabularModel( data_config=data_config, model_config=model_config, optimizer_config=optimizer_config, trainer_config=trainer_config, ) tabular_model.fit(train=train, validation=val) test.drop(columns=['target'], inplace=True) pred_df = tabular_model.predict(test) # tabular_model.fit(train=train, validation=val)
def test_date_encoding(timeseries_data, freq):
    """Verify the date feature generated for each supported ``freq`` code.

    ``"H"`` -> ``_Hour``, ``"D"`` -> ``_Dayofyear``, ``"T"`` -> ``_Minute``
    must be present in ``datamodule.train`` after setup. ``"S"`` must make
    ``setup("fit")`` raise ``RuntimeError``.
    """
    train, test, target = timeseries_data
    train, valid = train_test_split(train, random_state=42)

    sensor_columns = ["Temperature", "Humidity", "Light", "CO2", "HumidityRatio"]
    data_config = DataConfig(
        target=target + ["Occupancy"],
        continuous_cols=sensor_columns,
        categorical_cols=[],
        date_columns=[("date", freq)],
        encode_date_columns=True,
    )
    model_config = CategoryEmbeddingModelConfig(task="regression")
    trainer_config = TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None)
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=trainer_config,
    )

    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=tabular_model.config,
        test=test,
    )
    datamodule.prepare_data()

    if freq == "S":
        # This frequency is expected to be rejected during setup.
        try:
            datamodule.setup("fit")
        except RuntimeError:
            return
        assert False

    datamodule.setup("fit")
    config = datamodule.config  # re-read after setup; not inspected further here

    column_by_freq = {"H": "_Hour", "D": "_Dayofyear", "T": "_Minute"}
    if freq in column_by_freq:
        assert column_by_freq[freq] in datamodule.train.columns


# ---------------------------------------------------------------------------
# Disabled manual driver, kept for reference.
# ---------------------------------------------------------------------------
# from io import BytesIO
# from urllib.request import urlopen
# from zipfile import ZipFile
#
# import numpy as np
# import pandas as pd
# import pytest
# from sklearn.datasets import fetch_california_housing, fetch_covtype
#
#
# def load_timeseries_data():
#     url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip"
#     resp = urlopen(url)
#     zipfile = ZipFile(BytesIO(resp.read()))
#     train = pd.read_csv(zipfile.open("datatraining.txt"), sep=",")
#     val = pd.read_csv(zipfile.open("datatest.txt"), sep=",")
#     test = pd.read_csv(zipfile.open("datatest2.txt"), sep=",")
#     return (pd.concat([train, val], sort=False), test, ["Occupancy"])
#
#
# def load_regression_data():
#     dataset = fetch_california_housing(data_home="data", as_frame=True)
#     df = dataset.frame.sample(5000)
#     df["HouseAgeBin"] = pd.qcut(df["HouseAge"], q=4)
#     df["HouseAgeBin"] = "age_" + df.HouseAgeBin.cat.codes.astype(str)
#     test_idx = df.sample(int(0.2 * len(df)), random_state=42).index
#     test = df[df.index.isin(test_idx)]
#     train = df[~df.index.isin(test_idx)]
#     return (train, test, dataset.target_names)
#
#
# test_dataloader(
#     load_regression_data(),
#     validation_split=None,
#     multi_target=False,
#     continuous_cols=[
#         "AveRooms",
#         "AveBedrms",
#         "Population",
#         "AveOccup",
#         "Latitude",
#         "Longitude",
#     ],
#     categorical_cols=[],
#     continuous_feature_transform="yeo-johnson",
#     normalize_continuous_features=False,
#     target_transform=None,
#     embedding_dims=None
# )
# test_date_encoding(load_timeseries_data(), "S")  # ["H","D", "T", "S"],
def main():
    """Run 10-fold stratified cross-validation of a class-weighted classifier.

    For every fold a fresh ``TabularModel`` is fitted on the training part
    (with an inner train / validation split) using a class-weighted
    cross-entropy loss, then asked to predict the held-out part. Out-of-fold
    predictions are pooled, scored with ``print_metrics`` and summarised in
    a confusion matrix. The model from the last fold is saved to
    ``Analysis/basic``.
    """
    data, cat_col_names, num_col_names = data_load()
    bsize = 1024

    # ########## Define the Configs ############
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        layers="1024-512-512",
        activation="LeakyReLU",
        learning_rate=1e-3,
    )

    # Seeded like the inner train/validation split so the folds are repeatable.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    fold_truth = []
    fold_pred = []
    tabular_mode = None
    for train_idx, test_idx in cv.split(X=data, y=data.target.values):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        # A fresh model per fold so no weights carry over between folds.
        tabular_mode = TabularModel(
            data_config=data_config,
            model_config=model_config,
            optimizer_config=optimizer_config,
            trainer_config=trainer_config,
        )
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1)

        tabular_mode.fit(train=train,
                         validation=val,
                         max_epochs=100,
                         loss=weighted_loss)

        # Keep truth and prediction of the same fold side by side, so pairing
        # never depends on how the index of `data` happens to be ordered.
        fold_truth.append(test["target"])
        fold_pred.append(tabular_mode.predict(test)["prediction"])

    y_true = pd.concat(fold_truth)
    y_pred = pd.concat(fold_pred)

    print_metrics(y_true, y_pred, tag="Holdout")
    # The matrix was previously computed and thrown away; show it.
    print(confusion_matrix(y_true, y_pred))

    # Only the model fitted on the last fold is persisted.
    tabular_mode.save_model("Analysis/basic")
categorical_cols=cat_col_names, continuous_feature_transform="quantile_normal", normalize_continuous_features=True) trainer_config = TrainerConfig( auto_lr_find= True, # Runs the LRFinder to automatically derive a learning rate batch_size=1024, max_epochs=1000, gpus=1, #index of the GPU to use. 0, means CPU ) optimizer_config = OptimizerConfig() model_config = CategoryEmbeddingModelConfig( task="classification", layers="4096-4096-512", # Number of nodes in each layer activation="LeakyReLU", # Activation between each layers learning_rate=1e-3, metrics=["accuracy", "f1"], metrics_params=[{}, { "average": "micro" }]) tabular_model = TabularModel( data_config=data_config, model_config=model_config, optimizer_config=optimizer_config, trainer_config=trainer_config, ) # %% tabular_model.fit(train=train, test=test) # %%