def get_dataset(args):
    """Build the (train, test) detection datasets selected by ``args``.

    Three cases are handled:

    * ``'voc'`` appears in ``args.dataset_name``: the datasets are created by
      name only.
    * ``args.dataset_name`` is a key of ``dataset_dict``: the url, index file
      names and classes come from that entry, and the archive is downloaded
      and unzipped into ``args.dataset_root`` unless ``args.no_redownload``
      is set.
    * otherwise: the index file names and classes are read from ``args`` and
      the data is expected to be present already.

    Args:
        args: namespace-like object. Reads ``dataset_name`` and, for non-VOC
            datasets, ``dataset_root``, ``no_redownload``,
            ``index_file_name_trainval``, ``index_file_name_test`` and
            ``classes``.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` of ``task.Dataset`` objects.
    """
    # Built-in dataset (VOC).
    if 'voc' in args.dataset_name:
        # Adjacent literals instead of a backslash continuation inside the
        # string, so source indentation never leaks into the log message.
        logging.info(
            'Please follow this instruction to download dataset: '
            'https://gluon-cv.mxnet.io/build/examples_datasets/pascal_voc.html#sphx-glr-build-examples-datasets-pascal-voc-py ')
        train_dataset = task.Dataset(name=args.dataset_name)
        test_dataset = task.Dataset(name=args.dataset_name, Train=False)
        return (train_dataset, test_dataset)

    # Custom dataset.
    # Default location for both branches below. Previously this was only set
    # for datasets found in dataset_dict, so any other dataset crashed with an
    # unbound `data_root` when the Dataset objects were built.
    data_root = os.path.join(args.dataset_root, args.dataset_name)

    if args.dataset_name in dataset_dict:
        (url, index_file_name_trainval, index_file_name_test,
         classes) = dataset_dict[args.dataset_name]

        if not args.no_redownload:
            root = args.dataset_root
            filename_zip = ag.download(url, path=root)
            filename = ag.unzip(filename_zip, root=root)
            # Use the name reported by unzip rather than the dataset name.
            data_root = os.path.join(root, filename)
    else:
        logging.info("This dataset is not in dataset_dict. It should be downloaded before running this script.")
        index_file_name_trainval = args.index_file_name_trainval
        index_file_name_test = args.index_file_name_test
        classes = args.classes

    train_dataset = task.Dataset(data_root, index_file_name=index_file_name_trainval, classes=classes)
    test_dataset = task.Dataset(data_root, index_file_name=index_file_name_test, classes=classes, Train=False)

    return (train_dataset, test_dataset)
def download_shopee(dataset, data_path):
    """Download and unpack the shopee-iet archive unless a zip is already present.

    The presence check looks for ``<data_path>/<dataset>.zip``. When it is
    missing, the shopee-iet archive is downloaded into ``data/`` and unzipped
    into a directory named after the downloaded file minus its last four
    characters.

    NOTE(review): the check uses ``data_path``/``dataset`` while the download
    always uses the shopee-iet url and ``data/``. These only agree for
    ``dataset='shopee-iet'`` and ``data_path='data/'`` -- confirm with callers
    before changing either side.

    Args:
        dataset: archive base name (without ``.zip``) used for the check.
        data_path: directory searched for the existing archive.
    """
    archive_path = os.path.join(data_path, dataset + '.zip')
    if os.path.exists(archive_path):
        print(dataset + '.zip already exists.\n')
        return

    downloaded = ag.download(
        'https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip',
        path='data/')
    # Strip the last four characters (the ".zip" suffix) to get the target dir.
    extract_dir = downloaded[:-4]
    ag.mkdir(extract_dir)
    ag.unzip(downloaded, root=extract_dir)
def load_data(directory_prefix, train_file, test_file, name, url=None):
    """Load a train/test dataset pair, downloading the archive first if needed.

    The files are expected at ``<directory_prefix>/<name>/<train_file>`` and
    ``<directory_prefix>/<name>/<test_file>``. If either is missing, the zip
    at ``url`` is downloaded into ``directory_prefix``, unzipped there, and
    the zip file is removed afterwards.

    Args:
        directory_prefix: directory holding one sub-directory per dataset;
            created (including parents) if absent.
        train_file: file name of the training data inside the dataset dir.
        test_file: file name of the test data inside the dataset dir.
        name: dataset sub-directory name.
        url: location of the zip archive; only needed when the files are not
            already present locally.

    Returns:
        Tuple ``(train_data, test_data)`` of ``task.Dataset`` objects.

    Raises:
        ValueError: if the files are missing locally and ``url`` is None.
    """
    # makedirs handles nested prefixes and an already-existing directory.
    os.makedirs(directory_prefix, exist_ok=True)

    # os.path.join keeps the paths correct whether or not the prefix ends
    # with a separator (plain concatenation silently built a wrong path).
    directory = os.path.join(directory_prefix, name)
    train_file_path = os.path.join(directory, train_file)
    test_file_path = os.path.join(directory, test_file)

    if not (os.path.exists(train_file_path) and os.path.exists(test_file_path)):
        if url is None:
            # Fail with an actionable message instead of handing None to the
            # downloader.
            raise ValueError(
                "%s data not found locally under %s and no url was given to fetch it from"
                % (name, directory))
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)

    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
# Script: download the tiny_motorbike archive, fit an object detector on it,
# then build the dataset indexed by 'test'.
import os

import autogluon as ag
from autogluon import ObjectDetection as task
from console_logging.console import Console

console = Console()

# Download and unpack the archive into the current directory.
console.log("Baixando Dataset...")
root = './'
filename_zip = ag.download(
    'https://autogluon.s3.amazonaws.com/datasets/tiny_motorbike.zip',
    path=root)
filename = ag.unzip(filename_zip, root=root)

# Build the training dataset from the unpacked folder.
console.log("Criando TASK TRAIN ")
data_root = os.path.join(root, filename)
dataset_train = task.Dataset(data_root, classes=('motorbike', ))

# Fit: two trials over two candidate learning rates.
console.info("TRAINING DATA MODEL...")
time_limits = 5 * 60 * 60  # 5 hours
epochs = 30
detector = task.fit(
    dataset_train,
    num_trials=2,
    epochs=epochs,
    lr=ag.Categorical(5e-4, 1e-4),
    ngpus_per_trial=1,
    time_limits=time_limits,
)
console.success("TRAINING DONE !")

# Build the test dataset from the same folder using the 'test' index file.
console.log("START TEST MODEL ")
dataset_test = task.Dataset(data_root, index_file_name='test', classes=('motorbike', ))
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None):
    """Fit, reload and score a predictor on each benchmark dataset.

    For every selected dataset: the train/test CSVs are fetched if missing,
    the previous output directory is deleted, a predictor is fit, reloaded
    from its output directory, and scored on the test data. The score is an
    error value (``1 - accuracy_score`` for classification,
    ``1 - r2_score`` for regression), so lower is better. Warnings raised
    while the datasets are processed are recorded and re-issued at the end.

    Args:
        fast_benchmark: if truthy, train on ``train_data.head(subsample_size)``
            only and skip every performance-regression warning.
        subsample_size: argument passed to ``head`` when ``fast_benchmark`` is
            set; must not be None in that case.
        perf_threshold: factor applied to the stored reference error; results
            above ``reference * perf_threshold`` trigger a warning.
        seed_val: if not None, passed to ``seed``, ``np.random.seed`` and
            ``mx.random.seed`` before each dataset.
        fit_args: dict of extra keyword arguments forwarded to ``task.fit``.
        dataset_indices: optional indices into the benchmark list, selecting
            which datasets to run (and in which order).

    Raises:
        ValueError: if ``fast_benchmark`` is set without ``subsample_size``.
    """
    # Validate up front: previously this was only detected inside the loop,
    # after the first dataset had already been downloaded and loaded.
    if fast_benchmark and subsample_size is None:
        raise ValueError("fast_benchmark specified without subsample_size")

    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10  # keeps the "times worse" ratios finite if a reference value is 0

    # Information about each dataset in benchmark is stored in dict.
    # performance_val = expected performance on this dataset (lower = better),
    # should update based on previously run benchmarks.
    # Mixed types of features.
    binary_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                      'name': 'AdultIncomeBinaryClassification',
                      'problem_type': BINARY,
                      'label_column': 'class',
                      'performance_val': 0.129}
    # Big dataset with 7 classes, all features are numeric. Runs SLOW.
    multi_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
                     'name': 'CoverTypeMulticlassClassification',
                     'problem_type': MULTICLASS,
                     'label_column': 'Cover_Type',
                     'performance_val': 0.032}
    # Regression with mixed feature-types, skewed Y-values.
    regression_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
                          'name': 'AmesHousingPriceRegression',
                          'problem_type': REGRESSION,
                          'label_column': 'SalePrice',
                          'performance_val': 0.076}
    # 1-D toy deterministic regression task with: heavy label+feature
    # missingness, extra distraction column in test data.
    toyregres_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
                         'name': 'toyRegression',
                         'problem_type': REGRESSION,
                         'label_column': 'y',
                         'performance_val': 0.183}
    # 2-D toy noisy, imbalanced 4-class classification task with: feature
    # missingness, out-of-vocabulary feature categories in test data,
    # out-of-vocabulary labels in test data, training column missing from test
    # data, extra distraction columns in test data.
    toyclassif_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyClassification.zip',
                          'name': 'toyClassification',
                          'problem_type': MULTICLASS,
                          'label_column': 'y',
                          'performance_val': 0.436}
    # toyclassif_dataset should produce 3 warnings:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']
    # UserWarning: The columns listed below from the training data are no longer in the given dataset. (AutoGluon will proceed assuming their values are missing, but you should remove these columns from training dataset and train a new model):  ['lostcolumn']
    # UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = [toyregres_dataset, toyclassif_dataset, binary_dataset, regression_dataset, multi_dataset]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    os.makedirs(directory_prefix, exist_ok=True)

    # Record warnings raised while processing the datasets so they can be
    # listed again after all the per-dataset output.
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx, dataset in enumerate(datasets):
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url']))
                zip_name = ag.download(dataset['url'], directory_prefix)
                ag.unzip(zip_name, directory_prefix)
                os.remove(zip_name)

            savedir = directory + 'AutogluonOutput/'
            # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            shutil.rmtree(savedir, ignore_errors=True)

            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark

            predictor = None  # reset from last Dataset
            predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, **fit_args)
            # Called for its own sake; the returned summary is not used here.
            predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                              % (dataset['name'], predictor.problem_type, dataset['problem_type']))

            predictor = None  # We delete predictor here to test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy_score']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict['r2_score']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], perf, dataset['performance_val']))
            if (not fast_benchmark) and (perf > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance."
                              % (dataset['name'], perf/(EPS+dataset['performance_val'])))

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for dataset, perf in zip(datasets, performance_vals):
        print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], perf, dataset['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf/(EPS+previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf/(EPS+previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf/(EPS+previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)