""" Example: distilling AutoGluon's ensemble-predictor into a single model for binary classification. """ # NOTE: Distillation can be done in a similar manner for multiclass classification and regression problems. # NOTE: To distill CatBoost models in multiclass classification, you need to first run: pip install catboost-dev from autogluon.tabular import TabularDataset, TabularPredictor subsample_size = 500 time_limit = 60 label = 'class' # specifies which column do we want to predict train_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv' test_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv' train_data = TabularDataset(train_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = TabularDataset(test_file_path) test_data = test_data.head(subsample_size) # subsample for faster run # Fit model ensemble: predictor = TabularPredictor(label).fit(train_data, auto_stack=True, time_limit=time_limit) # Distill ensemble-predictor into single model: time_limit = 60 # set = None to fully train distilled models # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = TabularDataset(train_file_path)
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label = 'class' # specifies which column do we want to predict save_path = 'ag_models/' # where to save trained models predictor = TabularPredictor(label=label, path=save_path).fit(train_data) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: # predictor = TabularPredictor(label=label_column, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality') results = predictor.fit_summary() # Inference time: test_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame y_test = test_data[label] test_data = test_data.drop(labels=[label], axis=1) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = TabularPredictor.load(save_path) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
""" ################ # Loading Data # ################ from autogluon.tabular import TabularDataset, TabularPredictor train_data = TabularDataset( 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv' ) # can be local CSV file as well, returns Pandas DataFrame test_data = TabularDataset( 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv' ) # another Pandas DataFrame label = 'class' # specifies which column do we want to predict sample_train_data = train_data.head(100) # subsample for faster demo # Separate features and labels # Make sure to not include your label/target column when sending input to the feature generators, or else the label will be transformed as well. X = sample_train_data.drop(columns=[label]) y = sample_train_data[label] X_test = test_data.drop(columns=[label]) y_test = test_data[label] print(X) ############################## # Fitting feature generators # ##############################