""" Example: distilling AutoGluon's ensemble-predictor into a single model for binary classification. """

# NOTE: Distillation can be done in a similar manner for multiclass classification and regression problems.
# NOTE: To distill CatBoost models in multiclass classification, you need to first run:  pip install catboost-dev

from autogluon.tabular import TabularDataset, TabularPredictor

subsample_size = 500  # rows retained from each dataset to keep the demo quick
time_limit = 60       # seconds allotted to training the ensemble

label = 'class'  # target column we want to predict
train_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
test_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'

# Load and subsample the data (subsampling keeps this demo fast).
train_data = TabularDataset(train_file_path).head(subsample_size)
test_data = TabularDataset(test_file_path).head(subsample_size)

# Fit model ensemble:
predictor = TabularPredictor(label).fit(
    train_data,
    auto_stack=True,
    time_limit=time_limit,
)

# Distill ensemble-predictor into single model:

time_limit = 60  # set = None to fully train distilled models

# aug_data below is optional, but this could be additional unlabeled data you may have.
# Here we reuse the training data for demonstration, but you should only use new data here:
aug_data = TabularDataset(train_file_path)
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
# NOTE: TabularDataset takes the path positionally (the old `file_path=` keyword
# does not exist on the current API and is inconsistent with the other examples
# in this file). A local CSV path works as well; returns a pandas DataFrame.
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label = 'class'  # specifies which column we want to predict
save_path = 'ag_models/'  # where to save trained models

predictor = TabularPredictor(label=label, path=save_path).fit(train_data)
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary()

# Inference time:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another pandas DataFrame
y_test = test_data[label]
test_data = test_data.drop(columns=[label])  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = TabularPredictor.load(save_path)  # unnecessary here; we reload just to demonstrate loading a previously-trained predictor from disk
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
# NOTE(review): a stray unmatched `"""` was removed here — it was left over from
# concatenating example scripts and would have turned the remainder of the file
# into a string literal (or raised a SyntaxError at EOF).

################
# Loading Data #
################

from autogluon.tabular import TabularDataset, TabularPredictor

# Load the adult-income data; a local CSV path works as well.
# TabularDataset returns a pandas DataFrame.
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv'
)
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv'
)
label = 'class'  # target column we want to predict
sample_train_data = train_data.head(100)  # small subsample keeps the demo fast

# Split features from the target. Never feed the label/target column to the
# feature generators, or it will be transformed along with the features.
y = sample_train_data[label]
X = sample_train_data.drop(columns=[label])

y_test = test_data[label]
X_test = test_data.drop(columns=[label])

print(X)

##############################
# Fitting feature generators #
##############################