def train(args):
    """Fit an AutoGluon tabular predictor and report on the exported model.

    Loads the training data found under ``args.train``, pickles the list of
    training columns to ``columns.pkl``, fits a predictor for the label column
    ``'y'`` into ``args.model_dir``, optionally evaluates on ``args.test`` and
    finally prints the contents and size of ``/opt/ml/model/``.

    Args:
        args: Namespace providing ``hosts``, ``current_host``, ``train``,
            ``model_dir``, ``test`` and ``feature_importance``.

    Raises:
        ValueError: If ``args.current_host`` is not listed in ``args.hosts``.
    """
    # Single source of truth for the label so that fitting and the test-data
    # check below cannot disagree.
    label = 'y'

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    # Copy before removing the current host: popping from ``args.hosts``
    # directly would alter the caller's list as a side effect.
    dist_ip_addrs = list(args.hosts)
    dist_ip_addrs.pop(host_rank)
    # NOTE: the distributed-setup values above are not used further here.

    # Load training data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Persist the training column names next to the model artefacts
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        label=label,
    )

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if label in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')

            # Feature importance must be calculated on held-out (test) data;
            # on training data it would be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                print(predictor.feature_importance(test_data))
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
def test_advanced_functionality():
    """Exercise leaderboard, feature importance, refit and model deletion.

    Fits a predictor on a 100-row subsample of the Adult Income dataset and
    asserts the relationships between the predictor's model list, its
    leaderboards, ``refit_full`` and ``delete_models``.

    Raises:
        AssertionError: If any checked invariant does not hold, including
            when ``predict`` still succeeds after every model was deleted.
    """
    fast_benchmark = True
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file,
                                      test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    shutil.rmtree(savedir, ignore_errors=True)
    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)

    # The plain and the extended leaderboard must describe the same models.
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())

    # Feature importance must cover every original feature except the label.
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.keys()) == original_features

    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()

    # refit_full is expected to add one refit model per original model.
    assert predictor.get_model_full_dict() == dict()
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)

    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    # Without dry_run=False the call must leave every model in place.
    predictor.delete_models(models_to_keep=[])
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(dataset=test_data)

    # With dry_run=False the models are actually deleted.
    predictor.delete_models(models_to_keep=[], dry_run=False)
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0

    try:
        predictor.predict(dataset=test_data)
    except Exception:
        # Any ordinary error is the expected outcome; unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
def train(self, data, params):
    """Fit an AutoGluon predictor on ``data.unscaled_df`` and mark the object trained.

    Stores ``data``, the AutoGluon dataset and the fitted predictor on the
    instance and sets ``self.state`` to ``"TRAINED"``. The label column is the
    first entry of ``self.metadata.get("output")``. ``params`` is accepted but
    not read by this method.
    """
    self.data = data
    self.train_data = task.Dataset(data.unscaled_df)

    # Every call writes its models into a fresh, uuid-named sub-folder.
    model_folder = f'agModels-predictClass/{uuid.uuid4()}'
    label_name = self.metadata.get("output")[0]

    self.predictor = task.fit(
        train_data=self.train_data,
        label=label_name,
        output_directory=model_folder,
    )
    self.state = "TRAINED"
def run(self, train_path, test_path, target, task):
    """Fit on the training file and evaluate predictions on the test file.

    Args:
        train_path: Path handed to ``task.Dataset`` for the training data.
        test_path: Path handed to ``task.Dataset`` for the test data.
        target: Name of the label column, used both for fitting and for
            extracting the ground truth from the test data.
        task: Object exposing ``Dataset`` and ``fit`` (an AutoGluon task).

    Returns:
        Whatever ``predictor.evaluate_predictions`` returns for the test
        labels and predictions (called with ``auxiliary_metrics=True``).
    """
    train_data = task.Dataset(file_path=train_path)
    # The label is the ``target`` argument; the previous code referenced an
    # undefined ``label_column`` name here.
    predictor = task.fit(
        train_data=train_data,
        label=target,
        eval_metric="f1_macro",
        num_bagging_folds=5,
    )

    test_data = task.Dataset(file_path=test_path)
    y_test = test_data[target]
    y_pred = predictor.predict(test_data)
    return predictor.evaluate_predictions(
        y_true=y_test.to_numpy(),
        y_pred=y_pred,
        auxiliary_metrics=True,
    )
def train(args):
    """Fit an AutoGluon predictor on a random subsample of the training file.

    SageMaker passes num_cpus, num_gpus and other args that could be used to
    tailor training to the current container environment; here a simple cpu
    context is used.

    Args:
        args: Namespace providing ``model_dir``, ``label_column``, ``train``,
            ``train_filename`` and ``train_rows``.

    Returns:
        The predictor returned by ``task.fit``.
    """
    output_dir = args.model_dir
    label_name = args.label_column

    csv_path = get_file_path(args.train, args.train_filename)
    full_data = task.Dataset(file_path=csv_path)

    # Subsample a subset of the data for a faster demo; try much larger values.
    n_rows = int(args.train_rows)
    sampled_data = full_data.sample(n=n_rows, random_state=0)

    return task.fit(
        train_data=sampled_data,
        label=label_name,
        output_directory=output_dir,
    )
def run_tabular_benchmark_toy(fit_args):
    """Fit on the toy classification data and require ``predict`` to fail.

    The toy set is a 2-D noisy, imbalanced 4-class classification task with
    feature missingness, out-of-vocabulary feature categories and labels in
    the test data, a training column missing from the test data and extra
    distraction columns in the test data.

    Args:
        fit_args: Extra keyword arguments forwarded to ``task.fit``.

    Raises:
        AssertionError: If predicting on the test data does not raise
            ``KeyError``.
    """
    # NOTE(review): earlier notes on this dataset mention a ValueError for the
    # missing 'lostcolumn', while the handler below only accepts KeyError --
    # confirm which exception the installed AutoGluon version raises.
    dataset = dict(
        url='https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
        name='toyClassification',
        problem_type=MULTICLASS,
        label_column='y',
        performance_val=0.436,
    )
    data_root = './datasets/'
    train_data, test_data = load_data(
        directory_prefix=data_root,
        train_file='train_data.csv',
        test_file='test_data.csv',
        name=dataset['name'],
        url=dataset['url'],
    )
    print(f"Evaluating Benchmark Dataset {dataset['name']}")

    # Start from a clean output directory so that no information from
    # previous runs survives.
    output_dir = data_root + dataset['name'] + "/" + 'AutogluonOutput/'
    shutil.rmtree(output_dir, ignore_errors=True)

    predictor = task.fit(
        train_data=train_data,
        label=dataset['label_column'],
        output_directory=output_dir,
        **fit_args
    )
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)

    try:
        predictor.predict(test_data)
    except KeyError:
        # Expected: test_data lacks the training column 'lostcolumn'.
        return
    raise AssertionError(f'{dataset["name"]} should raise an exception.')
def train(args):
    """Fit an AutoGluon predictor on ``<args.train>/<args.filename>``.

    SageMaker passes num_cpus, num_gpus and other args that could be used to
    tailor training to the current container environment; here a simple cpu
    context is used.

    Args:
        args: Namespace providing ``current_host``, ``hosts``, ``model_dir``,
            ``target``, ``train`` and ``filename``.

    Returns:
        The predictor returned by ``task.fit``.

    Raises:
        KeyError: If the ``SM_NUM_GPUS`` environment variable is not set.
    """
    # Container details are read but not used by the fit below.
    gpu_count = int(os.environ['SM_NUM_GPUS'])
    this_host = args.current_host
    all_hosts = args.hosts

    output_dir = args.model_dir
    label_name = args.target

    # load training data
    data_dir = args.train
    data_file = args.filename
    logging.info(data_dir)

    dataset = task.Dataset(file_path=data_dir + '/' + data_file)
    return task.fit(
        train_data=dataset,
        label=label_name,
        output_directory=output_dir,
    )
def train_regression_autogluon(args, train_df, test_df):
    """Fit an AutoGluon regressor for ``'thrpt'`` and save test predictions.

    Writes ``pred_result.csv`` (columns ``gt`` and ``pred``) into
    ``args.out_dir`` and calls ``plot_save_figure`` with the ground truth and
    the predictions.

    Args:
        args: Namespace providing ``out_dir``.
        train_df: Training frame handed to ``task.Dataset(df=...)``.
        test_df: Test frame; must contain the ``'thrpt'`` column.
    """
    # AutoGluon is run with MXNet's numpy mode switched off.
    mx.npx.reset_np()
    try:
        from autogluon import TabularPrediction as task

        predictor = task.fit(
            train_data=task.Dataset(df=train_df),
            output_directory=args.out_dir,
            label='thrpt',
            eval_metric='mean_absolute_error',
        )
        test_prediction = predictor.predict(test_df)

        # Column 0 holds the ground truth, column 1 the prediction.
        ret = np.zeros((len(test_prediction), 2), dtype=np.float32)
        for i, (lhs, rhs) in enumerate(zip(test_df['thrpt'].to_numpy(), test_prediction)):
            ret[i][0] = lhs
            ret[i][1] = rhs
        df_result = pd.DataFrame(ret, columns=['gt', 'pred'])
        df_result.to_csv(os.path.join(args.out_dir, 'pred_result.csv'))
        plot_save_figure(
            gt_thrpt=test_df['thrpt'].to_numpy(),
            pred_thrpt=test_prediction,
            save_dir=args.out_dir,
        )
    finally:
        # Switch numpy mode back on even when fitting, predicting or plotting
        # fails, so the global MXNet state is not left altered for the caller.
        mx.npx.set_np()
def Load_GLUON(dataDownstream, dataFeaturized):
    """Return the ``feature_type`` column of ``AutoGluon_predictions.csv``.

    First writes an empty ``AutoGluon_predictions.csv`` with the columns
    ``column`` and ``feature_type``, then runs ``task.fit`` on a deep copy of
    ``dataDownstream`` extended with a constant ``label_target`` column, and
    finally reads the CSV back.

    Args:
        dataDownstream: Frame to inspect; it is deep-copied, not modified.
        dataFeaturized: Accepted but not read by this function.

    Returns:
        list: Values of the ``feature_type`` column read from the CSV.
    """
    import logging

    # Start from an empty predictions file with just the header row.
    df = pd.DataFrame(columns=['column', 'feature_type'])
    df.to_csv('AutoGluon_predictions.csv', index=False)

    train = copy.deepcopy(dataDownstream)
    train['label_target'] = 1
    train_data = task.Dataset(df=train)
    label_column = 'label_target'

    # NOTE(review): the fit result is never used; presumably the call is made
    # for a side effect that fills AutoGluon_predictions.csv -- confirm.
    try:
        task.fit(train_data=train_data, label=label_column)
    except Exception:
        # Best effort as before, but the failure is now logged instead of
        # vanishing, and KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.getLogger(__name__).warning(
            "task.fit failed; continuing with the current contents of "
            "AutoGluon_predictions.csv",
            exc_info=True,
        )

    agl_predictions = pd.read_csv('AutoGluon_predictions.csv')
    predictions = agl_predictions['feature_type'].values.tolist()
    return predictions
def train(args):
    """Fit an AutoGluon predictor with a fixed preset and model selection.

    SageMaker passes num_cpus, num_gpus and other args that could be used to
    tailor training to the current container environment; here a simple cpu
    context is used.

    Args:
        args: Namespace providing ``current_host``, ``hosts``, ``model_dir``,
            ``target``, ``train`` and ``filename``.

    Returns:
        The predictor returned by ``task.fit``.

    Raises:
        KeyError: If the ``SM_NUM_GPUS`` environment variable is not set.
    """
    # Container details are read but not used by the fit below.
    gpu_count = int(os.environ['SM_NUM_GPUS'])
    this_host = args.current_host
    all_hosts = args.hosts

    output_dir = args.model_dir
    label_name = args.target

    # load training data
    data_dir = args.train
    data_file = args.filename
    logging.info(data_dir)

    # Two GBM configurations: the defaults and an extra-trees variant whose
    # model name gets the suffix 'XT'.
    gbm_configs = [
        {},
        {'extra_trees': True, 'AG_args': {'name_suffix': 'XT'}},
    ]
    model_config = {
        'GBM': gbm_configs,
        'RF': {},
        'XT': {},
        'KNN': {},
        'custom': ['GBM'],
    }
    preset_name = 'medium_quality_faster_train'

    dataset = task.Dataset(file_path=data_dir + '/' + data_file)
    return task.fit(
        train_data=dataset,
        label=label_name,
        output_directory=output_dir,
        presets=preset_name,
        hyperparameters=model_config,
    )
def frc_AutoGluon(df_train, df_test, categoricalVars, responseVar='wk1_sales_all_stores'):
    """Fit AutoGluon on ``df_train`` and forecast ``df_test``.

    The columns named in ``categoricalVars`` are converted to ``str`` in both
    frames; the conversion is written back into the frames that were passed in.

    Args:
        df_train: Training frame containing ``responseVar``.
        df_test: Frame to forecast.
        categoricalVars: Names of the columns to cast to ``str``.
        responseVar: Name of the label column.

    Returns:
        dict: ``'autoGluon_frc'`` holds the predictions and
        ``'autoGluon_model'`` the fitted predictor.
    """
    import autogluon as ag
    from autogluon import TabularPrediction as task

    for column in categoricalVars:
        for frame in (df_train, df_test):
            frame[column] = frame[column].astype(str)

    # AutoGluon format
    ag_train = task.Dataset(df=df_train)
    ag_test = task.Dataset(df=df_test)

    predictor = task.fit(
        train_data=ag_train,
        output_directory="auto_gluon",
        label=responseVar,
        hyperparameter_tune=False,
    )

    # Forecast with the best model
    forecast = predictor.predict(ag_test)
    return {'autoGluon_frc': forecast, 'autoGluon_model': predictor}
print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url'])) os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip") train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run label_column = dataset['label_column'] # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits, eval_metric='mean_absolute_error') # Distill ensemble-predictor into single model: time_limits = 60 # set = None to fully train distilled models # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = task.Dataset(file_path=train_file_path) aug_data = aug_data.head(subsample_size) # subsample for faster demo distilled_model_names = predictor.distill( time_limits=time_limits, augment_args={'num_augmented_samples': 100} ) # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None):
    """Fit and evaluate AutoGluon on a fixed list of benchmark datasets.

    For every selected dataset the data is downloaded if it is not present
    locally, a predictor is fitted, re-loaded from disk and evaluated on the
    test split. The score is an error value (lower is better): ``1 - accuracy``
    for classification and ``1 - r2`` for regression. Results are printed and
    compared with the stored ``performance_val`` of each dataset.

    Args:
        fast_benchmark: If true, train on the first ``subsample_size`` rows
            only and skip all performance-regression warnings.
        subsample_size: Number of training rows used when ``fast_benchmark``
            is true.
        perf_threshold: Factor by which the stored performance may be exceeded
            before a warning is issued.
        seed_val: Seed applied to ``seed``, ``np.random`` and ``mx.random``
            before each dataset; ``None`` skips seeding.
        fit_args: Keyword arguments forwarded to ``task.fit``.
        dataset_indices: Optional indices into the dataset list that restrict
            which datasets are run.

    Raises:
        ValueError: If ``fast_benchmark`` is true and ``subsample_size`` is
            ``None``.
    """
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10  # keeps the ratio computations below from dividing by zero
    # Information about each dataset in benchmark is stored in dict.
    # performance_val = expected performance on this dataset (lower = better),should update based on previously run benchmarks
    binary_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                      'name': 'AdultIncomeBinaryClassification',
                      'problem_type': BINARY,
                      'label_column': 'class',
                      'performance_val': 0.129}  # Mixed types of features.
    multi_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
                     'name': 'CoverTypeMulticlassClassification',
                     'problem_type': MULTICLASS,
                     'label_column': 'Cover_Type',
                     'performance_val': 0.032}  # big dataset with 7 classes, all features are numeric. Runs SLOW.
    regression_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
                          'name': 'AmesHousingPriceRegression',
                          'problem_type': REGRESSION,
                          'label_column': 'SalePrice',
                          'performance_val': 0.076}  # Regression with mixed feature-types, skewed Y-values.
    toyregres_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
                         'name': 'toyRegression',
                         'problem_type': REGRESSION,
                         'label_column': 'y',
                         'performance_val': 0.183}  # 1-D toy deterministic regression task with: heavy label+feature missingness, extra distraction column in test data
    toyclassif_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyClassification.zip',
                          'name': 'toyClassification',
                          'problem_type': MULTICLASS,
                          'label_column': 'y',
                          'performance_val': 0.436}  # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data
    # toyclassif_dataset should produce 3 warnings:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them): ['distractioncolumn1', 'distractioncolumn2']
    # UserWarning: The columns listed below from the training data are no longer in the given dataset. (AutoGluon will proceed assuming their values are missing, but you should remove these columns from training dataset and train a new model): ['lostcolumn']
    # UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = [toyregres_dataset, toyclassif_dataset, binary_dataset, regression_dataset, multi_dataset]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)

    # Warnings raised inside this block are recorded instead of shown; they
    # are re-issued at the very end of the function.
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            # Re-seed before every dataset so each run starts from the same state.
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            dataset = datasets[idx]
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url']))
                zip_name = ag.download(dataset['url'], directory_prefix)
                ag.unzip(zip_name, directory_prefix)
                os.remove(zip_name)

            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            # Keep the labels aside and predict on label-free test data.
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
            predictor = None  # reset from last Dataset
            predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, **fit_args)
            results = predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = None  # We delete predictor here to test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy_score']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict['r2_score']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance."
                              % (dataset['name'], performance_vals[idx]/(EPS+dataset['performance_val'])))

        # Summarize:
        avg_perf = np.mean(performance_vals)
        median_perf = np.median(performance_vals)
        worst_perf = np.max(performance_vals)
        for idx in range(len(datasets)):
            print("Performance on dataset %s: %s (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))
        print("Average performance: %s" % avg_perf)
        print("Median performance: %s" % median_perf)
        print("Worst performance: %s" % worst_perf)
        # Aggregate regression checks are skipped for subsampled (fast) runs.
        if not fast_benchmark:
            if avg_perf > previous_avg_performance * perf_threshold:
                warnings.warn("Average Performance is %s times worse than previously." % (avg_perf/(EPS+previous_avg_performance)))
            if median_perf > previous_median_performance * perf_threshold:
                warnings.warn("Median Performance is %s times worse than previously." % (median_perf/(EPS+previous_median_performance)))
            if worst_perf > previous_worst_performance * perf_threshold:
                warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf/(EPS+previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
import pandas as pd
import autogluon.core as ag
from autogluon import TabularPrediction as task
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# AutoGluon settings: label column and folder for the trained models.
label_column = 'test'
dir = 'agModels-predictClass_jiagnwei'

# Training and test data currently come from the same file.
train_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")
test_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")  # TODO

# Fit a stacked ensemble for at most 1800 seconds.
predictor = task.fit(
    train_data=train_data,
    label=label_column,
    output_directory=dir,
    auto_stack=True,
    time_limits=1800,
)
results = predictor.fit_summary()

# Feature importance on the full test data (no subsampling).
importance = predictor.feature_importance(dataset=test_data, subsample_size=None)
print(importance)

# To inspect a previously trained predictor instead:
# predictor = task.load(dir)
# print(predictor.info())
# print(predictor.feature_importance(dataset=train_data))
""" Example script for predicting columns of tables, demonstrating simple use-case """
from autogluon import TabularPrediction as task

# ---------------------------------------------------------------------------
# Training time
# ---------------------------------------------------------------------------
# The path can be a local CSV file as well; the result is a Pandas DataFrame.
# Only the first 500 rows are kept for a faster demo.
train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
).head(500)
print(train_data.head())

label_column = 'class'  # column to predict
savedir = 'ag_models/'  # folder for the trained models

predictor = task.fit(
    train_data=train_data,
    label=label_column,
    output_directory=savedir,
)
# NOTE: The default settings above are intended to ensure reasonable runtime
# at the cost of accuracy. To maximize predictive accuracy, do this instead:
# predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, presets='best_quality', eval_metric=YOUR_METRIC_NAME)
results = predictor.fit_summary()

# ---------------------------------------------------------------------------
# Inference time
# ---------------------------------------------------------------------------
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
y_test = test_data[label_column]
# Delete labels from test data since we wouldn't have them in practice.
test_data = test_data.drop(labels=[label_column], axis=1)
print(test_data.head())

# Unnecessary; the predictor is reloaded only to demonstrate how to load a
# previously-trained predictor from file.
predictor = task.load(savedir)
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)
def train():
    """Train one AutoGluon classifier per class and fold and pickle the results.

    Reads ``all_model_df.csv`` and ``y5.npy`` from ``training_path``. For each
    unique value ``c`` in ``y5`` and each of the 5 stratified folds it loads a
    previously pickled result, fits a one-vs-rest AutoGluon model on the
    columns given by ``chsicpredictor.hsic_idx_``, scores it on the fold's
    test rows and pickles the outcome into ``model_path``.

    On any exception a ``failure`` file is written to ``output_path``, the
    traceback is printed to stderr and the process exits with status 255.
    """
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)
        # Collect the files present in the training channel
        input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ]
        if len(input_files) == 0:
            raise ValueError(('There are no files in {}.\n' + 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 'the data specification in S3 was incorrectly specified or the role specified\n' + 'does not have permission to access the data.').format(training_path, channel_name))
        print('Found x number of files')
        all_model_df = pd.read_csv(os.path.join(training_path,'all_model_df.csv'))
        y5 = np.load(os.path.join(training_path,'y5.npy'))
        kf = StratifiedKFold(n_splits = 5)
        # NOTE(review): the fold counter is initialised once, before the class
        # loop, and never reset -- confirm that file names for later classes
        # are meant to continue counting rather than restart at 0.
        f = 0
        # for each cancer in tcga
        for c in tqdm(np.unique(y5)):
            for train_index, test_index in kf.split(all_model_df,y5):
                t1 = time.time()
                print(c,"starting fold",f)
                # load prev.pickle with most important biomarkers
                # The unpickled indices replace the ones produced by kf.split above.
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; only use with trusted training channels.
                with open(os.path.join(training_path,"c"+str(c)+"_f"+str(f)+"_5hsic5adasynlgbm100ft.b"), "rb") as fp:
                    train_index,test_index,chsicpredictor,predy,acc = pickle.load(fp)
                # Binary target: 1 where the sample belongs to class c, else 0.
                c_idx = np.where(y5==c)[0]
                cy = np.zeros_like(y5)
                cy[c_idx] = 1
                # train an ensemble model with AutoML to maximize accuracy
                train_data = all_model_df.iloc[train_index].iloc[:, chsicpredictor.hsic_idx_]
                train_data["label"] = cy[train_index]
                clf = task.fit(train_data, label="label", presets='best_quality', auto_stack=True, output_directory="_autogluon_c_"+str(c)+"_f"+str(f))
                # Same binary encoding for the test rows of this fold.
                test_y = y5[test_index]
                c_idx = np.where(test_y==c)[0]
                test_y = np.zeros_like(test_y)
                test_y[c_idx] = 1
                bpredy = clf.predict(all_model_df.iloc[test_index].iloc[:, chsicpredictor.hsic_idx_])
                bacc = accuracy_score(test_y, bpredy)
                # NOTE(review): this prints ``acc`` loaded from the pickle, not
                # the freshly computed ``bacc`` -- confirm that is intended.
                print("done in ",time.time()-t1,"acc",acc)
                # save the results
                model_file_name = "AutoML_c"+str(c)+"_f"+str(f)+"_5hsic5adasynlgbm100ft.b"
                acc_file_name = "AutoML_c"+str(c)+"_f"+str(f)+"_acc.b"
                with open(os.path.join(model_path,model_file_name), "wb") as fp:
                    pickle.dump((train_index,test_index,chsicpredictor,predy,acc,bpredy,bacc,clf),fp)
                with open(os.path.join(model_path,acc_file_name), "wb") as fp:
                    pickle.dump((bacc),fp)
                f+=1
        print('Training complete.')
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)
'num_epochs': 10, 'activation': 'relu', 'dropout_prob': ag.Real(0.0, 0.5) }, 'GBM': { 'num_boost_round': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True) } } predictor = task.fit( train_data=train_data, label=label_column, output_directory=savedir, hyperparameter_tune=True, hyperparameters=hyperparams, num_trials=5, time_limits=1 * 60, num_bagging_folds=0, stack_ensemble_levels=0 ) # since tuning_data = None, automatically determines train/validation split results = predictor.fit_summary() # display detailed summary of fit() process # Inference time: test_data = task.Dataset( file_path= 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv' ) # another Pandas DataFrame print(test_data.head())
class_order) = autoweka_fit_predict(train_data=train_data, test_data=test_data, label_column=label_column, problem_type=problem_type, output_directory=output_directory, autoweka_path=autoweka_path, eval_metric=eval_metric, runtime_sec=runtime_sec, random_state=random_state, num_cores=num_cores) # Can use autogluon.tabular.Predictor to evaluate predictions (assuming metric correctly specified): ag_predictor = task.fit(task.Dataset(df=train_data), label=label_column, problem_type=problem_type, eval_metric=eval_metric, hyperparameters={'GBM': { 'num_boost_round': 2 }}) if eval_metric == 'roc_auc': preds_toevaluate = y_prob[:, 1] elif eval_metric == 'log_loss': preds_toevaluate = y_prob else: preds_toevaluate = y_pred perf = ag_predictor.evaluate_predictions( test_data[label_column], preds_toevaluate ) # use y_prob or y_prob[:,1] instead of y_pred for metrics like log_loss or roc_auc print("Auto-WEKA test performance: %s" % perf)
from autogluon import TabularPrediction as task
from data_config.data_config import load_data, data_config

if __name__ == '__main__':
    # Maps each dataset name to the score returned by predictor.evaluate.
    res = {}
    for data_name, config in data_config.items():
        ylabel = config['ylabel']
        X_train, X_valid = load_data(data_name, combine_y=True)
        train_data = task.Dataset(df=X_train)
        test_data = task.Dataset(df=X_valid)

        # Trained models are saved in a folder named after the dataset.
        savedir = f'{data_name}/'
        fit_options = dict(
            eval_metric='roc_auc',
            verbosity=2,
            visualizer='tensorboard',
            random_seed=0,
            save_space=True,
            keep_only_best=True,
        )
        predictor = task.fit(
            train_data=train_data,
            label=ylabel,
            output_directory=savedir,
            **fit_options
        )

        # Evaluation uses the raw validation frame returned by load_data.
        res[data_name] = predictor.evaluate(X_valid)

    print(res)

    import pickle
    with open('autogluon_result.pickle', 'wb') as f:
        pickle.dump(res, f)
# Column to predict.
label_column = 'status'
# NOTE(review): ``dir`` shadows the builtin of the same name and is not passed
# to the fit call below -- confirm whether an output directory is intended.
dir = 'agModels-predictClass'  # specifies folder where to store trained models

# Quick data inspection helpers (disabled):
# print(train_data.head(10))
# print(train_data.info())
# print(train_data.describe())

if __name__ == '__main__':
    # Earlier variant that wrote the models to ``dir`` (disabled):
    # predictor = task.fit(train_data=train_data, label=label_column, output_directory=dir, time_limits=100)
    # results = predictor.fit_summary()
    # print("AutoGluon infers problem type is: ", predictor.problem_type)
    # print("AutoGluon identified the following types of features:")
    # print(predictor.feature_metadata)
    #
    # predictor.leaderboard(train_data, silent=True)
    #
    # print(results)

    time_limits = 60  # for quick demonstration only, you should set this to longest time you are willing to wait (in seconds)
    # NOTE(review): ``metric`` is assigned but not passed to task.fit below.
    metric = 'roc_auc'  # specify your evaluation metric here
    # ``train_data`` is not defined in this part of the file; it is expected
    # to exist at module level before this point.
    predictor = task.fit(train_data=train_data, label=label_column, time_limits=time_limits)
    results = predictor.fit_summary()

    # Optional inspection of the fit (disabled):
    # print("AutoGluon infers problem type is: ", predictor.problem_type)
    # print("AutoGluon identified the following types of features:")
    # print(predictor.feature_metadata)
    # results.to_csv('111.csv')
    # data_utils.data_to_excel(results)
    # predictor.leaderboard(train_data, silent=True)
    # print(results)
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False):
    """Fit, reload and score AutoGluon on a fixed list of benchmark datasets.

    For every selected dataset the function loads the train/test files,
    optionally seeds the random generators, wipes the previous output
    directory, fits a predictor, reloads it from disk, predicts on the
    test features and converts the score into an error value
    (1 - accuracy, or 1 - r2 for regression; lower is better).

    Args:
        fast_benchmark: When truthy, training data is cut to its first
            ``subsample_size`` rows and no regression warnings are issued.
        subsample_size: Row count used when ``fast_benchmark`` is truthy.
        perf_threshold: Multiplier applied to the stored reference
            performance before comparing against the new performance.
        seed_val: Seed passed to ``seed``, ``np.random.seed`` and
            ``mx.random.seed``; seeding is skipped when ``None``.
        fit_args: Extra keyword arguments forwarded to ``task.fit``.
        dataset_indices: Optional indices selecting a subset of datasets.
        run_distill: When truthy, ``predictor.distill`` is called per dataset.

    Raises:
        ValueError: If ``fast_benchmark`` is truthy and ``subsample_size``
            is ``None`` (raised inside the per-dataset loop).
    """
    print("Running fit with args:")
    print(fit_args)

    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    eps = 1e-10  # keeps the "times worse" ratios finite

    # Information about each dataset in benchmark is stored in dict.
    # performance_val = expected performance on this dataset (lower = better),
    # should update based on previously run benchmarks
    # Mixed types of features.
    binary_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
        'label_column': 'class',
        'performance_val': 0.129,
    }
    # big dataset with 7 classes, all features are numeric. Runs SLOW.
    multi_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
        'name': 'CoverTypeMulticlassClassification',
        'problem_type': MULTICLASS,
        'label_column': 'Cover_Type',
        'performance_val': 0.032,
    }
    # Regression with mixed feature-types, skewed Y-values.
    regression_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
        'name': 'AmesHousingPriceRegression',
        'problem_type': REGRESSION,
        'label_column': 'SalePrice',
        'performance_val': 0.076,
    }
    # 1-D toy deterministic regression task with: heavy label+feature
    # missingness, extra distraction column in test data
    toyregres_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
        'name': 'toyRegression',
        'problem_type': REGRESSION,
        'label_column': 'y',
        'performance_val': 0.183,
    }

    # List containing dicts for each dataset to include in benchmark
    # (try to order based on runtimes)
    datasets = [toyregres_dataset, binary_dataset, regression_dataset, multi_dataset]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]
    n_datasets = len(datasets)

    # Aggregate performance summaries obtained in previous benchmark run:
    reference_perfs = [entry['performance_val'] for entry in datasets]
    previous_avg_performance = np.mean(reference_perfs)
    previous_median_performance = np.median(reference_perfs)
    previous_worst_performance = np.max(reference_perfs)

    # Run benchmark:
    performance_vals = [0.0] * n_datasets  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx, dataset in enumerate(datasets):
            name = dataset['name']
            train_data, test_data = load_data(directory_prefix=directory_prefix,
                                              train_file=train_file,
                                              test_file=test_file,
                                              name=name,
                                              url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (name, idx + 1, n_datasets))

            savedir = directory_prefix + name + "/" + 'AutogluonOutput/'
            # Delete AutoGluon output directory to ensure previous runs'
            # information has been removed.
            shutil.rmtree(savedir, ignore_errors=True)

            # Split the label off the test frame before predicting.
            label_column = dataset['label_column']
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)

            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark

            predictor = task.fit(train_data=train_data, label=label_column,
                                 output_directory=savedir, **fit_args)
            predictor.fit_summary(verbosity=0)
            expected_type = dataset['problem_type']
            if predictor.problem_type != expected_type:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                              % (name, predictor.problem_type, expected_type))

            # Test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred,
                                                       auxiliary_metrics=True)
            # accuracy -> error-rate; r2 -> unexplained variance score.
            score_key = 'r2_score' if expected_type == REGRESSION else 'accuracy_score'
            perf = 1.0 - perf_dict[score_key]
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)"
                  % (name, perf, dataset['performance_val']))

            regressed = perf > dataset['performance_val'] * perf_threshold
            if (not fast_benchmark) and regressed:
                warnings.warn("Performance on dataset %s is %s times worse than previous performance."
                              % (name, perf / (eps + dataset['performance_val'])))
            if run_distill:
                predictor.distill(time_limits=60, augment_args={'size_factor': 0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for dataset, perf in zip(datasets, performance_vals):
        print("Performance on dataset %s: %s (previous perf=%s)"
              % (dataset['name'], perf, dataset['performance_val']))
    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously."
                          % (avg_perf / (eps + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously."
                          % (median_perf / (eps + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously."
                          % (worst_perf / (eps + previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for recorded in caught_warnings:
        warnings.warn(recorded.message)
else: excluded_model_types = [] # Create output directory pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) (X_train, y_train), (X_valid, y_valid) = load_data(use_test=False) df_train = convert_to_dataframe(X_train, y_train) df_valid = convert_to_dataframe(X_valid, y_valid) predictor = task.fit( train_data=task.Dataset(df=df_train), tuning_data=task.Dataset(df=df_valid), label="label", output_directory=output_dir, time_limits=args.walltime, hyperparameter_tune=True, auto_stack=True, excluded_model_types=excluded_model_types, ) else: _, (X_test, y_test) = load_data(use_test=True) print("Convert arrays to DataFrame...") df_test = convert_to_dataframe(X_test, y_test) print("Loading models...") predictor = task.load(output_dir, verbosity=4) print("Predicting...") t1 = time.time()
" -O temp.zip && unzip -o temp.zip && rm temp.zip") savedir = directory + 'agModels/' label_column = dataset['label_column'] train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run print(train_data.head()) # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, problem_type='multiclass', output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits) # Distill ensemble-predictor into single model: time_limits = 60 # None # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = task.Dataset(file_path=train_file_path) aug_data = aug_data.head(subsample_size) # subsample for faster demo distilled_model_names = predictor.distill( time_limits=time_limits, augment_args={'num_augmented_samples': 100} ) # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)
def test_tabularHPO():
    """Run the hyperparameter-tuning benchmark over ``datasets``.

    The visible body reads its configuration (``datasets``, ``seed_val``,
    ``train_file``, ``test_file``, ``fast_benchmark``, ``subsample_size``,
    ``hyperparameter_tune``, ``hyperparameters``, ``time_limits``,
    ``num_trials``, ``verbosity``, ``perf_threshold``, ``EPS``) from the
    enclosing scope. For each dataset it downloads the files if they are
    missing, fits a predictor, reloads it from disk, and records
    1 - accuracy (or 1 - r2 for regression) as the performance value.
    """
    # Aggregate performance summaries obtained in previous benchmark run:
    reference_perfs = [entry['performance_val'] for entry in datasets]
    previous_avg_performance = np.mean(reference_perfs)
    previous_median_performance = np.median(reference_perfs)
    previous_worst_performance = np.max(reference_perfs)

    # Run benchmark:
    n_datasets = len(datasets)
    performance_vals = [0.0] * n_datasets  # performance obtained in this run
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx, dataset in enumerate(datasets):
            seed(seed_val)
            np.random.seed(seed_val)
            mx.random.seed(seed_val)

            name = dataset['name']
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (name, idx + 1, n_datasets))
            directory = name + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            files_present = os.path.exists(train_file_path) and os.path.exists(test_file_path)
            if not files_present:
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" % (name, dataset['url']))
                fetch_cmd = "wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip"
                os.system(fetch_cmd)

            savedir = directory + 'AutogluonOutput/'
            # Delete AutoGluon output directory to ensure previous runs'
            # information has been removed.
            shutil.rmtree(savedir, ignore_errors=True)

            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)

            # The fast benchmark trains on a subsample and forwards the
            # extra tuning arguments; the full benchmark does neither.
            if fast_benchmark:
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
            predictor = None  # reset from last Dataset
            fit_kwargs = {
                'train_data': train_data,
                'label': label_column,
                'output_directory': savedir,
                'hyperparameter_tune': hyperparameter_tune,
            }
            if fast_benchmark:
                fit_kwargs['hyperparameters'] = hyperparameters
                fit_kwargs['time_limits'] = time_limits
                fit_kwargs['num_trials'] = num_trials
            fit_kwargs['verbosity'] = verbosity
            predictor = task.fit(**fit_kwargs)
            predictor.fit_summary(verbosity=0)

            expected_type = dataset['problem_type']
            if predictor.problem_type != expected_type:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                              % (name, predictor.problem_type, expected_type))

            # We delete predictor here to test loading previously-trained
            # predictor from file
            predictor = None
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred,
                                                       auxiliary_metrics=True)
            # accuracy -> error-rate; r2 -> unexplained variance score.
            score_key = 'r2_score' if expected_type == REGRESSION else 'accuracy_score'
            perf = 1.0 - perf_dict[score_key]
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)"
                  % (name, perf, dataset['performance_val']))

            regressed = perf > dataset['performance_val'] * perf_threshold
            if (not fast_benchmark) and regressed:
                warnings.warn("Performance on dataset %s is %s times worse than previous performance."
                              % (name, perf / (EPS + dataset['performance_val'])))

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for dataset, perf in zip(datasets, performance_vals):
        print("Performance on dataset %s: %s (previous perf=%s)"
              % (dataset['name'], perf, dataset['performance_val']))
    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously."
                          % (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously."
                          % (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously."
                          % (worst_perf / (EPS + previous_worst_performance)))

    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for recorded in caught_warnings:
        warnings.warn(recorded.message)
def processData(data, label_column=None, output_directory=None, ag_predictor=None, problem_type=None, eval_metric=None):
    """Turn a pandas DataFrame into an all-numeric feature matrix.

    Applies the same preprocessing AutoGluon's tabular neural network
    model uses (missing value imputation, one-hot encoding of
    categoricals, handling of high-cardinality categoricals, unknown
    categorical levels at test-time, etc.).

    Pass ``ag_predictor=None`` for training data: a dummy neural network
    predictor (zero epochs, no embedding layers) is then fit purely to
    obtain the data processor. For test data pass the predictor returned
    from the training call; its first model must be the tabular NN.

    ``output_directory`` is accepted but not used in this function.

    Returns:
        Tuple ``(X, y, ag_predictor)``; ``y`` is ``None`` when the label
        column is not present in ``data``.

    Raises:
        ValueError: If training data is processed without a usable
            ``label_column``, if the first model is not the neural net,
            or if the processed dataset does not hold exactly one array.
    """
    if ag_predictor is None:
        # fit dummy neural network model just to preprocess data.
        # Here we ensure no embedding layers are used.
        if label_column is None:
            raise ValueError("when processing training data, label_column cannot be None")
        if label_column not in data.columns:
            raise ValueError("label_column cannot be missing from training data")
        nn_settings = {'NN': {'num_epochs': 0, 'proc.embed_min_categories': np.inf}}
        generator_settings = {
            'enable_nlp_vectorizer_features': False,
            'enable_nlp_ratio_features': False,
        }
        ag_predictor = task.fit(train_data=task.Dataset(data),
                                tuning_data=task.Dataset(data),
                                label=label_column,
                                hyperparameter_tune=False,
                                problem_type=problem_type,
                                eval_metric=eval_metric,
                                hyperparameters=nn_settings,
                                num_bagging_folds=0,
                                stack_ensemble_levels=0,
                                label_count_threshold=1,
                                verbosity=2,
                                feature_generator_kwargs=generator_settings)

    # This must be the neural net model which contains data processor
    trainer = ag_predictor._trainer
    first_model_name = trainer.get_model_names_all()[0]
    model = trainer.load_model(first_model_name)
    if 'NeuralNet' not in model.name:
        raise ValueError("Data preprocessing error. This model should be the NeuralNet, not the: %s" % model.name)

    bad_inds = []  # row-indices to remove from dataset
    labels_present = label_column is not None and label_column in data.columns
    if labels_present:
        label_cleaner = ag_predictor._learner.label_cleaner
        raw_labels = data[label_column].values
        data = data.drop([label_column], axis=1, inplace=False)
        y = label_cleaner.transform(raw_labels)
        if np.sum(y.isna()) > 0:
            # remove these inds as label is NaN (due to very rare classes)
            bad_inds = y.index[y.apply(np.isnan)].tolist()
            warnings.warn("Dropped these rows from data in preprocessing, due to missing labels: " + str(bad_inds))
    else:
        y = None

    # general autogluon data processing.
    data_initial_processed = ag_predictor._learner.transform_features(data)
    # neural net-specific autogluon data processing required to turn
    # tabular data into numerical matrix.
    tabNN_data = model.process_data(data_initial_processed, is_test=True)
    numeric_data = tabNN_data.dataset._data  # list of mxnet.NDArrays
    if len(numeric_data) != 1:
        raise ValueError("Data Preprocessing failed.")

    X = pd.DataFrame(numeric_data[0].asnumpy())  # 2D Numpy array -> frame
    X.columns = ['feature' + str(col) for col in range(X.shape[1])]
    if len(bad_inds) > 0:
        y.drop(index=bad_inds, inplace=True)
        X.drop(index=bad_inds, axis=0, inplace=True)
    return (X, y, ag_predictor)
def test_advanced_functionality():
    """Exercise leaderboard, persistence, refit and deletion APIs of a predictor.

    Fits a predictor on a 100-row subsample of the Adult Income dataset
    and then checks, in order: leaderboard consistency, feature
    importance keys, feature transformation, model persist/unpersist
    round-trips, save/load, ``refit_full`` idempotence and
    ``delete_models`` with and without ``dry_run``.
    """
    fast_benchmark = True
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    # Delete AutoGluon output directory to ensure previous runs'
    # information has been removed.
    shutil.rmtree(savedir, ignore_errors=True)

    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)

    # Leaderboard with and without extra info must list the same models.
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())

    # Feature importance must cover exactly the non-label input columns.
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.keys()) == original_features

    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()

    # Assert that no models were persisted during training
    assert predictor.get_model_names_persisted() == []
    # Assert that no models were unpersisted
    assert predictor.unpersist_models() == []

    persisted_models = predictor.persist_models(models='all', max_memory=None)
    # Ensure all models are persisted
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)
    # Ensure that no additional models are persisted on repeated calls
    assert predictor.persist_models(models='all', max_memory=None) == []

    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    # Assert that all models were unpersisted
    assert predictor.get_model_names_persisted() == []

    # Raise exception
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])

    assert predictor.get_model_names_persisted() == []
    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    predictor.persist_models(models='all', max_memory=None)
    # Save predictor while models are persisted: Intended functionality is
    # that they won't be persisted when loaded.
    predictor.save()
    # Assert that predictor loading works
    predictor_loaded = TabularPredictor.load(output_directory=predictor.output_directory)
    leaderboard_loaded = predictor_loaded.leaderboard(dataset=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    # Assert that models were not still persisted after loading predictor
    assert predictor_loaded.get_model_names_persisted() == []

    assert predictor.get_model_full_dict() == dict()
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    # Test that dry-run (the call below passes no dry_run argument)
    # doesn't delete models
    predictor.delete_models(models_to_keep=[])
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(dataset=test_data)

    # Test that dry_run=False actually deletes models
    predictor.delete_models(models_to_keep=[], dry_run=False)
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0

    # Predicting with no models left must fail. Catch Exception rather
    # than using a bare except so KeyboardInterrupt/SystemExit still
    # propagate out of the test.
    try:
        predictor.predict(dataset=test_data)
    except Exception:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')

    print('Tabular Advanced Functionality Test Succeeded.')
def train(args):
    """Train an AutoGluon predictor on a subsample of the training channel.

    Steps: load the training data, pickle its column names to
    ``columns.pkl`` in the working directory, subsample
    ``args.train_rows`` rows, fit with ``args.fit_args``, print the fit
    summary, optionally report a leaderboard and feature importance on
    ``args.test``, and print a listing of ``/opt/ml/model/``.

    Args:
        args: Namespace providing ``model_dir``, ``train``,
            ``train_rows``, ``fit_args``, ``test`` and
            ``feature_importance``.

    Returns:
        The fitted predictor returned by ``task.fit``.
    """
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor
    # training to the current container environment, but here we just use
    # simple cpu context.
    model_dir = args.model_dir

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Record the column layout of the full (pre-subsampling) training data.
    column_dict = {"columns": train_data.columns.tolist()}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # subsample subset of data for faster demo, try setting this to much
    # larger values. The request is capped at the number of available rows
    # because sampling without replacement raises ValueError when asked for
    # more rows than exist.
    # NOTE(review): assumes __load_input_data returns a pandas DataFrame.
    subsample_size = min(int(args.train_rows), len(train_data))
    train_data = train_data.sample(n=subsample_size, random_state=0)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=model_dir,
        **args.fit_args,
    )

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        # Leaderboard on test data
        print('Running model on test data and getting Leaderboard...')
        leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
        print(format_for_print(leaderboard), end='\n\n')

        # Feature importance on test data
        # Note: Feature importance must be calculated on held-out (test) data.
        # If calculated on training data it will be biased due to overfitting.
        if args.feature_importance:
            print('Feature importance:')
            # Increase rows to print feature importance
            pd.set_option('display.max_rows', 500)
            print(predictor.feature_importance(test_data))

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
    return predictor
for seed in seeds:
    with mlflow.start_run(run_name='autogluon'):
        # Create output directory for auto gluon: a random 12-character
        # folder name below 'AutogluonModels'.
        models_dir = 'AutogluonModels'
        name_alphabet = string.ascii_uppercase + string.digits
        random_dir = ''.join(random.choices(name_alphabet, k=12))
        output_dir = os.path.join(models_dir, random_dir)
        os.mkdir(output_dir)

        # Split data into two parts (train, valid)
        train, valid = train_test_split(data, random_state=seed)

        fit_options = {
            'train_data': train,
            'label': target_column,
            'problem_type': 'regression',
            'eval_metric': custom_metric,
            'stopping_metric': custom_metric,
            'hyperparameters': hyper_parameters,
            'stack_ensemble_levels': 2,
            'time_limits': run_time_secs,
            'cache_data': False,
            'verbosity': 2,
            'output_directory': output_dir,
        }
        predictor = task.fit(**fit_options)

        test_data = valid
        y_test = test_data[target_column]  # values to predict
        # delete label column to prove we're not cheating
        test_data_nolab = test_data.drop(labels=[target_column], axis=1)

        # NOTE(review): the fit above is configured as regression, yet the
        # predictions are obtained through predict_proba() — confirm this
        # is intended rather than predict().
        y_pred = predictor.predict_proba(test_data_nolab)
        score = RMSLE(y_test, y_pred)
        mlflow.log_metric('RMSLE', score)
def train(args):
    """Train an AutoGluon predictor and write evaluation artifacts.

    Steps: load the training channel, pickle its column names to
    ``columns.pkl``, fit with ``args.fit_args``, copy the model summary
    HTML and draw the ensemble graph into ``{args.output_dir}/data``,
    then (if ``args.test`` is set and contains the label column) write a
    leaderboard, optional feature importance, and for classification
    problems a classification report, confusion matrix and ROC output.
    Finally prints a listing of ``/opt/ml/model/``.

    Args:
        args: Namespace providing ``output_dir``, ``hosts``,
            ``current_host``, ``train``, ``fit_args`` (must contain
            ``'label'``), ``model_dir``, ``test`` and
            ``feature_importance``.

    Raises:
        ValueError: If ``args.current_host`` is not in ``args.hosts``.
        KeyError: If ``args.fit_args`` has no ``'label'`` entry.
    """
    model_output_dir = f'{args.output_dir}/data'

    # Distributed-setup bookkeeping; these values are not used further in
    # this function. The host list is copied before popping so the
    # caller's args.hosts is left untouched.
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = list(args.hosts)
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )

    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # ensemble visualization: drop isolated nodes, then render bottom-to-top
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        # 'rectangle' is the valid Graphviz shape name (was misspelled).
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance = predictor.feature_importance(test_data)
                feature_importance_df = pd.DataFrame(
                    feature_importance, columns=['Importance score']
                ).rename_axis(index='Feature')
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix

                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)

                report_dict = classification_report(y_test_true, y_test_pred,
                                                    output_dict=True,
                                                    labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)

                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save before show(): once show() returns the figure may
                # already be closed, which would write an empty image.
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels,
                            predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
# Fit a neural-network-only AutoGluon predictor on every configured
# dataset and pickle the collected evaluation results.
for data_name, config in data_config.items():
    hyperparameters = {'NN': {}}
    ylabel = config['ylabel']
    X_train, X_valid = load_data(data_name, combine_y=True)
    train_data = task.Dataset(df=X_train)
    test_data = task.Dataset(df=X_valid)

    fit_options = {
        'train_data': train_data,
        'label': ylabel,
        'output_directory': f'{data_name}/',  # where to save trained models
        'eval_metric': 'roc_auc',
        'stack_ensemble_levels': 0,
        # 'auto_stack': True,
        'num_bagging_folds': 5,
        'verbosity': 2,
        'visualizer': 'tensorboard',
        'random_seed': 0,
        'save_space': True,
        'keep_only_best': True,
        'hyperparameters': hyperparameters,
    }
    predictor = task.fit(**fit_options)

    # Score on the validation frame returned by load_data.
    res[data_name] = predictor.evaluate(X_valid)

print(res)
import pickle
with open('WideDeep_result.pickle', 'wb') as f:
    pickle.dump(res, f)