def test_cpu_only_raise(set_env_train_without_gpu):
    """Fitting with ``num_gpus=0`` must raise a RuntimeError unless the
    ``AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU`` environment variable is enabled.

    ``set_env_train_without_gpu`` is a fixture/parametrize value:
    ``True`` enables the env-var escape hatch; ``None`` and ``False``
    both leave it unset, so ``fit()`` should fail.
    """
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    # Deterministic sub-sampling keeps this test fast and reproducible.
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    if set_env_train_without_gpu is True:
        # Escape hatch: explicitly allow CPU-only training.
        # NOTE(review): the env var is never reset, so it can leak into
        # later tests in the same process — confirm this is intended.
        os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '1'
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0, time_limit=30, seed=123)
        verify_predictor_save_load(predictor, dev_data, verify_proba=True)
    else:
        # Both None and False mean "override not enabled": fit must raise.
        # (The original duplicated this branch verbatim for None and False;
        # merged here since the bodies were identical.)
        with pytest.raises(RuntimeError):
            predictor.fit(train_data,
                          hyperparameters=get_test_hyperparameters(),
                          num_gpus=0, seed=123)
def test_cpu_only_raise():
    """``task.fit`` with ``ngpus_per_trial=0`` must raise unless the
    ``AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU`` env var is set to ``'1'``."""
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sst/dev.parquet')
    sampler = np.random.RandomState(123)
    train_rows = sampler.permutation(len(train_data))[:100]
    dev_rows = sampler.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[train_rows]
    dev_data = dev_data.iloc[dev_rows]

    def launch_cpu_fit():
        # The identical fit invocation exercised by every phase below.
        return task.fit(train_data,
                        hyperparameters=test_hyperparameters,
                        label='label',
                        num_trials=1,
                        ngpus_per_trial=0,
                        verbosity=4,
                        output_directory='./sst',
                        plot_results=False)

    # Phase 1: env var unset -> CPU-only training must raise.
    with pytest.raises(RuntimeError):
        launch_cpu_fit()
    # Phase 2: explicitly enabled -> training proceeds on CPU.
    os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '1'
    predictor = launch_cpu_fit()
    # Phase 3: explicitly disabled -> must raise again.
    os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '0'
    with pytest.raises(RuntimeError):
        launch_cpu_fit()
def test_predictor_fit(key):
    """End-to-end fit/evaluate/save-load cycle for dataset ``key``,
    including continuous fit on the same predictor and on a reloaded one."""
    info = DATA_INFO[key]
    train_df = load_pd.load(info['train'])
    dev_df = load_pd.load(info['dev'])
    verify_proba = info['verify_proba']
    rng = np.random.RandomState(123)
    train_rows = rng.permutation(len(train_df))[:100]
    dev_rows = rng.permutation(len(dev_df))[:10]
    train_df = train_df.iloc[train_rows]
    dev_df = dev_df.iloc[dev_rows]

    def run_fit(p):
        # Shared fit invocation for the initial and continuous fits.
        p.fit(train_df,
              hyperparameters=get_test_hyperparameters(),
              time_limit=30,
              seed=123)

    predictor = TextPredictor(label=info['label'], eval_metric=info['metric'])
    run_fit(predictor)
    dev_score = predictor.evaluate(dev_df)
    verify_predictor_save_load(predictor, dev_df, verify_proba=verify_proba)
    # Continuous fit: calling fit() again on an already-fitted predictor.
    run_fit(predictor)
    verify_predictor_save_load(predictor, dev_df, verify_proba=verify_proba)
    # Save to a folder, reload, and continue fitting the reloaded predictor.
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictor = TextPredictor.load(root)
        run_fit(predictor)
def test_mixed_column_type():
    """Train on a table that mixes text, categorical and numerical columns:
    regression on 'score', classification on 'genre', and a fit restricted
    to an explicit ``feature_columns`` list."""
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sts/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    # Add more columns as feature: duplicate/rename columns so that text,
    # categorical and numerical modalities are all present in one table.
    train_data = pd.DataFrame({'sentence1': train_data['sentence1'],
                               'sentence2': train_data['sentence2'],
                               'sentence3': train_data['sentence2'],
                               'categorical0': train_data['genre'],
                               'numerical0': train_data['score'],
                               'genre': train_data['genre'],
                               'score': train_data['score']})
    dev_data = pd.DataFrame({'sentence1': dev_data['sentence1'],
                             'sentence2': dev_data['sentence2'],
                             'sentence3': dev_data['sentence2'],
                             'categorical0': dev_data['genre'],
                             'numerical0': dev_data['score'],
                             'genre': dev_data['genre'],
                             'score': dev_data['score']})
    # Train Regression
    predictor1 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          label='score', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_score',
                          plot_results=False)
    dev_rmse = predictor1.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor1, dev_data)
    # Train Classification
    predictor2 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          label='genre', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_genre',
                          plot_results=False)
    # Renamed from the misleading 'dev_rmse' — this is accuracy.
    dev_acc = predictor2.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor2, dev_data, verify_proba=True)
    # Specify the feature column
    predictor3 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          feature_columns=['sentence1', 'sentence3', 'categorical0'],
                          label='score', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_score',
                          plot_results=False)
    # BUG FIX: evaluate the predictor that was just trained (predictor3).
    # The original re-evaluated predictor1 here, so predictor3's metric
    # was never actually checked.
    dev_rmse = predictor3.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor3, dev_data)
def train(args):
    """Train an AutoGluon model on a GLUE-style task and write predictions.

    Reads train/dev/test files from ``args``, optionally augments paired-
    sentence tasks, fits a predictor according to ``args.mode``
    ('stacking' | 'weighted' | 'single'), then saves dev/test predictions
    as CSV and the dev metric score as JSON under ``args.exp_dir``.
    """
    set_seed(args.seed)
    # Look up the task definition; only preconfigured tasks are supported.
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    # Keep only the task's feature columns (+ label for train/dev).
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # Augmenting the un-ordered set manually: sentence pairs in these
        # tasks are symmetric, so append a copy with the two sentence
        # columns swapped to double the effective training data.
        train_df_other_part = pd.DataFrame(
            {feature_columns[0]: train_df[feature_columns[1]],
             feature_columns[1]: train_df[feature_columns[0]],
             label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        # Multi-layer stack ensemble over multimodal models.
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal',
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        # Weighted ensemble (no bagging/stacking).
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal')
    elif args.mode == 'single':
        # When no embedding is used,
        # we will just use TextPredictor that will train a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      seed=args.seed)
    else:
        raise NotImplementedError
    # Evaluate on the *original* (non-augmented) dev set and persist outputs.
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
def test_mixed_column_type():
    """Fit TextPredictor on a table mixing text, categorical and numerical
    columns: regression, classification, and a column-subset fit."""
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sts/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sts/dev.parquet')
    sampler = np.random.RandomState(123)
    train_rows = sampler.permutation(len(train_data))[:1000]
    dev_rows = sampler.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[train_rows]
    dev_data = dev_data.iloc[dev_rows]

    def widen(frame):
        # Duplicate/rename columns so that text, categorical and numerical
        # modalities are all present in one table.
        return pd.DataFrame({'sentence1': frame['sentence1'],
                             'sentence2': frame['sentence2'],
                             'sentence3': frame['sentence2'],
                             'categorical0': frame['genre'],
                             'numerical0': frame['score'],
                             'genre': frame['genre'],
                             'score': frame['score']})

    train_data = widen(train_data)
    dev_data = widen(dev_data)
    # Regression on the numeric 'score' target.
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(),
                  time_limit=30, seed=123)
    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor, dev_data)
    # Classification on the categorical 'genre' target.
    predictor = TextPredictor(label='genre', verbosity=4)
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(),
                  time_limit=30, seed=123)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor, dev_data, verify_proba=True)
    # Restrict training to an explicit subset of columns.
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data[['sentence1', 'sentence3', 'categorical0', 'score']],
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30, seed=123)
    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor, dev_data)
def test_cpu_only_warning():
    """Requesting ``num_gpus=0`` should emit a UserWarning (not fail)."""
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sst/dev.parquet')
    picker = np.random.RandomState(123)
    keep_train = picker.permutation(len(train_data))[:100]
    keep_dev = picker.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[keep_train]
    dev_data = dev_data.iloc[keep_dev]
    predictor = TextPredictor(label='label', eval_metric='acc')
    with pytest.warns(UserWarning):
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0,
                      seed=123)
def test_sst(hyperparameters):
    """Smoke test: fit, evaluate, and save/load round-trip on SST."""
    # NOTE(review): this test reads from the 'autogluon-text-data' bucket while
    # the other tests use 'autogluon-text' — confirm the bucket is intentional.
    train_data = load_pd.load(
        'https://autogluon-text-data.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text-data.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    shuffler = np.random.RandomState(123)
    train_rows = shuffler.permutation(len(train_data))[:100]
    dev_rows = shuffler.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[train_rows]
    dev_data = dev_data.iloc[dev_rows]
    predictor = TextPredictor(label='label', eval_metric='acc')
    predictor.fit(train_data, hyperparameters=hyperparameters)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor, dev_data, verify_proba=True)
def test_no_job_finished_raise():
    """A tiny time budget lets no trial finish, so ``task.fit`` must raise."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    # Setting a very small time limit so no job can complete.
    with pytest.raises(RuntimeError):
        task.fit(train_data,
                 hyperparameters=test_hyperparameters,
                 label='label',
                 num_trials=1,
                 ngpus_per_trial=0,
                 verbosity=4,
                 time_limits=10,
                 output_directory='./sst_raise',
                 plot_results=False)
def test_sts():
    """Regression smoke test on STS: fit, RMSE evaluation, save/load."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/dev.parquet')
    chooser = np.random.RandomState(123)
    train_keep = chooser.permutation(len(train_data))[:100]
    dev_keep = chooser.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[train_keep]
    dev_data = dev_data.iloc[dev_keep]
    predictor = task.fit(train_data,
                         hyperparameters=test_hyperparameters,
                         label='score',
                         num_trials=1,
                         verbosity=4,
                         ngpus_per_trial=1,
                         output_directory='./sts',
                         plot_results=False)
    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor, dev_data)
def test_empty_text_item():
    """Rows whose text cell is missing (None) must not break training."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    selector = np.random.RandomState(123)
    train_data = train_data.iloc[selector.permutation(len(train_data))[:100]]
    # Blank out two text cells to simulate missing entries.
    train_data.iat[0, 0] = None
    train_data.iat[10, 0] = None
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30)
def test_predictor_fit(key):
    """Single fit/evaluate/save-load pass for the dataset named by ``key``."""
    info = DATA_INFO[key]
    train_data = load_pd.load(info['train'])
    dev_data = load_pd.load(info['dev'])
    rng = np.random.RandomState(123)
    train_keep = rng.permutation(len(train_data))[:100]
    dev_keep = rng.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[train_keep]
    dev_data = dev_data.iloc[dev_keep]
    predictor = TextPredictor(label=info['label'], eval_metric=info['metric'])
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    dev_score = predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data,
                               verify_proba=info['verify_proba'])
def __init__(self, path_or_df: Union[str, pd.DataFrame],
             *,
             columns=None,
             label_columns=None,
             column_metadata: Optional[Union[str, Dict]] = None,
             column_properties: Optional[collections.OrderedDict] = None,
             categorical_default_handle_missing_value: bool = True):
    """Build an NLP tabular dataset from a path or an in-memory DataFrame.

    Loads (or accepts) the table, optionally restricts it to ``columns``,
    normalizes it, infers per-column properties, and coerces missing
    values per column type before storing the table internally.

    Parameters
    ----------
    path_or_df
        The path or dataframe of the tabular dataset for NLP.
    columns
        The chosen columns to load the data. A single column name is
        accepted and wrapped into a list.
    label_columns
        The name of the label columns. This helps to infer the column
        properties.
    column_metadata
        The metadata object that describes the property of the columns in
        the dataset. May be a dict or a path to a JSON file.
    column_properties
        The given column properties (pre-computed; passed through to the
        inference step as known values).
    categorical_default_handle_missing_value
        Whether to handle missing value in categorical columns by default.
    """
    super().__init__()
    if isinstance(path_or_df, pd.DataFrame):
        df = path_or_df
    else:
        df = load_pd.load(path_or_df)
    if columns is not None:
        # Accept a bare column name as well as a list of names.
        if not isinstance(columns, list):
            columns = [columns]
        df = df[columns]
    df = normalize_df(df)
    if column_metadata is None:
        column_metadata = dict()
    elif isinstance(column_metadata, str):
        # A string is treated as a path to a JSON metadata file.
        with open(column_metadata, 'r') as f:
            column_metadata = json.load(f)
    # Inference the column properties
    column_properties = get_column_properties(
        df,
        metadata=column_metadata,
        label_columns=label_columns,
        provided_column_properties=column_properties,
        categorical_default_handle_missing_value=categorical_default_handle_missing_value)
    for col_name, prop in column_properties.items():
        if prop.type == _C.TEXT:
            # Missing text becomes the empty string; everything coerced to str.
            df[col_name] = df[col_name].fillna('').apply(str)
        elif prop.type == _C.NUMERICAL:
            # Missing numericals become -1 and each cell is wrapped as an
            # ndarray. NOTE(review): -1 as the missing sentinel collides with
            # legitimate -1 values — confirm downstream consumers expect this.
            df[col_name] = df[col_name].fillna(-1).apply(np.array)
    self._table = df
    self._column_properties = column_properties
def test_no_job_finished_raise():
    """A one-second time limit cannot finish any job, so fit() must raise."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    # Setting a very small time limit to trigger the "no job finished" error.
    with pytest.raises(RuntimeError):
        predictor = TextPredictor(label='label')
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      time_limit=1,
                      num_gpus=1,
                      seed=123)
def test_mrpc():
    """Binary classification smoke test on MRPC: fit, evaluate, predict,
    and predict_proba all complete."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/mrpc/train.parquet'
    )
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/mrpc/dev.parquet'
    )
    rand = np.random.RandomState(123)
    train_sel = rand.permutation(len(train_data))[:100]
    dev_sel = rand.permutation(len(dev_data))[:10]
    train_data = train_data.iloc[train_sel]
    dev_data = dev_data.iloc[dev_sel]
    predictor = task.fit(train_data,
                         hyperparameters=test_hyperparameters,
                         label='label',
                         num_trials=1,
                         verbosity=4,
                         ngpus_per_trial=1,
                         output_directory='./mrpc',
                         plot_results=False)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    dev_prediction = predictor.predict(dev_data)
    dev_pred_prob = predictor.predict_proba(dev_data)
def test_empty_text_item():
    """Missing text entries (None) should be tolerated by ``task.fit``."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    shuffle_rng = np.random.RandomState(123)
    train_data = train_data.iloc[shuffle_rng.permutation(len(train_data))[:100]]
    # Simulate empty text items in the first feature column.
    train_data.iat[0, 0] = None
    train_data.iat[10, 0] = None
    predictor = task.fit(train_data,
                         hyperparameters=test_hyperparameters,
                         label='score',
                         num_trials=1,
                         ngpus_per_trial=0,
                         verbosity=4,
                         output_directory='./sts_empty_text_item',
                         plot_results=False)
def test_preprocessor(dataset_name, url, label_column, backbone_name, all_to_text):
    """Check that MultiModalTextFeatureProcessor is deterministic
    (fit_transform == transform on the same data), consistent between
    labeled/unlabeled transforms, and survives a pickle round-trip."""
    all_df = load_pd.load(url)
    feature_columns = [col for col in all_df.columns if col != label_column]
    train_df, valid_df = train_test_split(
        all_df,
        test_size=0.1,
        random_state=np.random.RandomState(100))
    column_types, problem_type = infer_column_problem_types(
        train_df, valid_df, label_columns=label_column)
    cfg = base_preprocess_cfg()
    if all_to_text:
        # Force every modality to be treated as text for this parametrization.
        cfg.defrost()
        cfg.categorical.convert_to_text = True
        cfg.numerical.convert_to_text = True
        cfg.freeze()
    preprocessor = MultiModalTextFeatureProcessor(column_types=column_types,
                                                  label_column=label_column,
                                                  tokenizer_name=backbone_name,
                                                  cfg=cfg)
    train_dataset = preprocessor.fit_transform(train_df[feature_columns],
                                               train_df[label_column])
    train_dataset_after_transform = preprocessor.transform(
        train_df[feature_columns], train_df[label_column])
    # fit_transform and a subsequent transform must agree element-wise
    # (within float tolerance).
    for i in range(len(train_dataset)):
        for j in range(len(train_dataset[0])):
            npt.assert_allclose(train_dataset[i][j],
                                train_dataset_after_transform[i][j],
                                1E-4, 1E-4)
    valid_dataset = preprocessor.transform(valid_df[feature_columns],
                                           valid_df[label_column])
    test_dataset = preprocessor.transform(valid_df[feature_columns])
    assert_dataset_match(train_dataset, train_dataset_after_transform)
    # Transforming with and without labels must yield identical features
    # (the loop bound is the unlabeled dataset's width).
    for i in range(len(test_dataset)):
        for j in range(len(test_dataset[0])):
            npt.assert_allclose(valid_dataset[i][j], test_dataset[i][j], 1E-4, 1E-4)
    # Test for pickle dump and load
    with tempfile.TemporaryDirectory() as tmp_dir_name:
        with open(os.path.join(tmp_dir_name, 'preprocessor.pkl'), 'wb') as out_f:
            pickle.dump(preprocessor, out_f)
        with open(os.path.join(tmp_dir_name, 'preprocessor.pkl'), 'rb') as in_f:
            preprocessor_loaded = pickle.load(in_f)
        # The reloaded preprocessor must reproduce both labeled and
        # unlabeled transforms exactly.
        valid_dataset_loaded = preprocessor_loaded.transform(
            valid_df[feature_columns], valid_df[label_column])
        assert_dataset_match(valid_dataset_loaded, valid_dataset)
        test_dataset_loaded = preprocessor_loaded.transform(
            valid_df[feature_columns])
        assert_dataset_match(test_dataset_loaded, test_dataset)
def __init__(self, *args, **kwargs):
    """Construct a tabular Dataset from one of three sources.

    Recognized keyword arguments:
      - ``df``: an existing pandas DataFrame (takes precedence over
        ``file_path``); ``copy=True`` deep-copies it first.
      - ``file_path``: a path/URL to load via ``load_pd.load``.
      - otherwise: all arguments are forwarded unchanged to the parent
        DataFrame constructor.
    ``subsample`` (int > 1) keeps only the first N rows; ``name`` and
    ``feature_types`` are stored as plain attributes.
    """
    file_path = kwargs.get('file_path', None)
    name = kwargs.get('name', None)
    feature_types = kwargs.get('feature_types', None)
    df = kwargs.get('df', None)
    subsample = kwargs.get('subsample', None)
    copy = kwargs.get('copy', False)
    construct_from_df = False  # whether or not we are constructing new dataset object from scratch based on provided DataFrame.
    # if df is None and file_path is None: # Cannot be used currently!
    #     raise ValueError("Must specify either named argument 'file_path' or 'df' in order to construct tabular Dataset")
    if df is not None:  # Create Dataset from existing Python DataFrame:
        construct_from_df = True
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                "'df' must be existing pandas DataFrame. To read dataset from file instead, use 'file_path' string argument."
            )
        if file_path is not None:
            # 'df' wins; warn rather than silently ignoring 'file_path'.
            warnings.warn(
                "Both 'df' and 'file_path' supplied. Creating dataset based on DataFrame 'df' rather than reading from file_path."
            )
        if copy:
            # Deep copy so later mutations don't affect the caller's frame.
            df = df.copy(deep=True)
    elif file_path is not None:  # Read from file to create dataset
        construct_from_df = True
        df = load_pd.load(file_path)
    if construct_from_df:  # Construct new Dataset object based off of DataFrame
        if subsample is not None:
            if not isinstance(subsample, int) or subsample <= 1:
                raise ValueError("'subsample' must be of type int and > 1")
            df = df.head(subsample)
        super().__init__(df)
        self.file_path = file_path
        self.name = name
        self.feature_types = feature_types
        self.subsample = subsample
    else:
        # Neither 'df' nor 'file_path' given: behave like the parent class.
        super().__init__(*args, **kwargs)
def fit(self, train_data, tuning_data=None, time_limit=None,
        presets=None, hyperparameters=None, feature_columns=None,
        column_types=None, num_cpus=None, num_gpus=None, num_trials=None,
        seed=None):
    """Fit the predictor

    Merges preset and user hyperparameters, resolves feature/label columns,
    splits off tuning data when none is given, infers column and problem
    types, and trains the backend multimodal text model.

    Parameters
    ----------
    train_data
        The training data (pandas DataFrame, or a path loadable by load_pd).
    tuning_data
        The tuning data. If None, a holdout split is carved from train_data.
    time_limit
        The time limit (in seconds) passed to the backend trainer.
    presets
        The user can specify the presets of the hyper-parameters.
    hyperparameters
        The hyper-parameters; merged on top of the preset values.
    feature_columns
        Specify which columns in the data to use as features. If None,
        every non-label column is used.
    column_types
        The provided type of the columns (skips/assists inference).
    num_cpus
        The number of CPUs to use for each trial.
    num_gpus
        The number of GPUs to use for each trial.
    num_trials
        The number of trials. By default, we will use the provided number
        of trials in the hyperparameters or presets. This will overwrite
        the provided value.
    seed
        The seed of the experiment.

    Returns
    -------
    self
    """
    # Refitting an already-fitted predictor is not supported.
    assert self._fit_called is False
    if presets is not None:
        preset_hparams = ag_text_presets.create(presets)
    else:
        preset_hparams = ag_text_presets.create('default')
    hyperparameters = merge_params(preset_hparams, hyperparameters)
    # An explicit seed argument overrides the one in the hyperparameters;
    # either way, the effective seed is read back from the merged dict.
    if seed is not None:
        hyperparameters['seed'] = seed
    seed = hyperparameters['seed']
    if num_trials is not None:
        hyperparameters['hpo_params']['num_trials'] = num_trials
    if isinstance(self._label, str):
        label_columns = [self._label]
    else:
        label_columns = list(self._label)
    # Get the training and tuning data as pandas dataframe
    if not isinstance(train_data, pd.DataFrame):
        train_data = load_pd.load(train_data)
    if feature_columns is None:
        all_columns = list(train_data.columns)
        feature_columns = [ele for ele in all_columns
                           if ele not in label_columns]
    else:
        if isinstance(feature_columns, str):
            feature_columns = [feature_columns]
        for col in feature_columns:
            assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
            assert col in train_data.columns,\
                'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                'all columns = "{}"'.format(col, train_data.columns)
        all_columns = feature_columns + label_columns
        train_data = train_data[all_columns]
    # Get tuning data
    if tuning_data is not None:
        if not isinstance(tuning_data, pd.DataFrame):
            tuning_data = load_pd.load(tuning_data)
        tuning_data = tuning_data[all_columns]
    else:
        # No tuning data: carve a holdout split out of the training data.
        if hyperparameters['misc']['holdout_frac'] is not None:
            holdout_frac = hyperparameters['misc']['holdout_frac']
        else:
            num_trials = hyperparameters['hpo_params']['num_trials']
            if num_trials == 1:
                holdout_frac = default_holdout_frac(len(train_data), False)
            else:
                # For HPO, we will need to use a larger held-out ratio
                holdout_frac = default_holdout_frac(len(train_data), True)
        train_data, tuning_data = train_test_split(
            train_data,
            test_size=holdout_frac,
            random_state=np.random.RandomState(seed))
    column_types, problem_type = infer_column_problem_types(
        train_data, tuning_data,
        label_columns=label_columns,
        problem_type=self._problem_type,
        provided_column_types=column_types)
    self._eval_metric, log_metrics = infer_eval_log_metrics(
        problem_type=problem_type,
        eval_metric=self._eval_metric)
    # The text-prediction task requires at least one text column.
    has_text_column = False
    for k, v in column_types.items():
        if v == _C.TEXT:
            has_text_column = True
            break
    if not has_text_column:
        raise AssertionError('No Text Column is found! This is currently not supported by '
                             'the TextPrediction task. You may try to use '
                             'autogluon.tabular.TabularPredictor.\n'
                             'The inferred column properties of the training data is {}'
                             .format(train_data))
    logger.log(25, 'Problem Type="{}"'.format(problem_type))
    logger.log(25, printable_column_type_string(column_types))
    self._problem_type = problem_type
    model_hparams = hyperparameters['models']['MultimodalTextModel']
    self._backend = model_hparams['backend']
    if model_hparams['backend'] == 'gluonnlp_v0':
        # Imported lazily so that the MXNet backend is only required
        # when it is actually selected.
        from ..mx.models import MultiModalTextModel
        self._model = MultiModalTextModel(column_types=column_types,
                                          feature_columns=feature_columns,
                                          label_columns=label_columns,
                                          problem_type=self._problem_type,
                                          eval_metric=self._eval_metric,
                                          log_metrics=log_metrics,
                                          output_directory=self._path)
        self._model.train(train_data=train_data,
                          tuning_data=tuning_data,
                          num_cpus=num_cpus,
                          num_gpus=num_gpus,
                          search_space=model_hparams['search_space'],
                          hpo_params=hyperparameters['hpo_params'],
                          time_limit=time_limit,
                          seed=seed,
                          verbosity=self.verbosity)
    else:
        raise NotImplementedError("Currently, we only support using "
                                  "the autogluon-contrib-nlp and MXNet "
                                  "as the backend of AutoGluon-Text. In the future, "
                                  "we will support other models.")
    return self
def test_multimodal_batchify(dataset_name, url, label_column, backbone_name,
                             all_to_text, insert_sep, stochastic_chunk):
    # Test for multimodal batchify: every batch produced in 'train' and
    # 'test' mode must have the expected number of feature groups, and
    # tokenized text must respect the auto-shrunk max length.
    all_df = load_pd.load(url)
    feature_columns = [col for col in all_df.columns if col != label_column]
    train_df, valid_df = train_test_split(
        all_df,
        test_size=0.1,
        random_state=np.random.RandomState(100))
    column_types, problem_type = infer_column_problem_types(
        train_df, valid_df, label_columns=label_column)
    cfg = base_preprocess_cfg()
    if all_to_text:
        # Treat categorical and numerical columns as text for this run.
        cfg.defrost()
        cfg.categorical.convert_to_text = True
        cfg.numerical.convert_to_text = True
        cfg.freeze()
    preprocessor = MultiModalTextFeatureProcessor(column_types=column_types,
                                                  label_column=label_column,
                                                  tokenizer_name=backbone_name,
                                                  cfg=cfg)
    cls_id, sep_id = get_cls_sep_id(preprocessor.tokenizer)
    train_dataset = preprocessor.fit_transform(train_df[feature_columns],
                                               train_df[label_column])
    test_dataset = preprocessor.transform(valid_df[feature_columns])
    # Shrink the max sequence length to the 0.9 quantile of observed
    # lengths (rounded to a multiple of 32, capped at 512).
    auto_max_length = auto_shrink_max_length(
        train_dataset=train_dataset,
        insert_sep=insert_sep,
        num_text_features=len(preprocessor.text_feature_names),
        auto_max_length_quantile=0.9,
        round_to=32,
        max_length=512)
    train_batchify_fn = MultiModalTextBatchify(
        num_text_inputs=len(preprocessor.text_feature_names),
        num_categorical_inputs=len(preprocessor.categorical_feature_names),
        num_numerical_inputs=len(preprocessor.numerical_feature_names) > 0,
        cls_token_id=cls_id,
        sep_token_id=sep_id,
        max_length=auto_max_length,
        mode='train',
        stochastic_chunk=stochastic_chunk,
        insert_sep=insert_sep)
    test_batchify_fn = MultiModalTextBatchify(
        num_text_inputs=len(preprocessor.text_feature_names),
        num_categorical_inputs=len(preprocessor.categorical_feature_names),
        num_numerical_inputs=len(preprocessor.numerical_feature_names) > 0,
        cls_token_id=cls_id,
        sep_token_id=sep_id,
        max_length=auto_max_length,
        mode='test',
        stochastic_chunk=stochastic_chunk,
        insert_sep=insert_sep)
    train_dataloader = DataLoader(train_dataset, batch_size=4,
                                  batchify_fn=train_batchify_fn, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=4,
                                 batchify_fn=test_batchify_fn, shuffle=False)
    # Train mode: each sample is (features, label).
    for sample in train_dataloader:
        features, label = sample[0], sample[1]
        assert len(features) == train_batchify_fn.num_text_outputs + \
            train_batchify_fn.num_categorical_outputs + train_batchify_fn.num_numerical_outputs
        text_token_ids, text_valid_length, text_segment_ids = features[0]
        assert text_token_ids.shape[1] <= auto_max_length
        assert text_segment_ids.shape[1] <= auto_max_length
        assert text_token_ids.shape == text_segment_ids.shape
    # Test mode: each sample is the feature tuple only (no label).
    for sample in test_dataloader:
        assert len(sample) == test_batchify_fn.num_text_outputs + \
            test_batchify_fn.num_categorical_outputs + test_batchify_fn.num_numerical_outputs
        text_token_ids, text_valid_length, text_segment_ids = sample[0]
        assert text_token_ids.shape[1] <= auto_max_length
        assert text_segment_ids.shape[1] <= auto_max_length
        assert text_token_ids.shape == text_segment_ids.shape
def test_distillation():
    """Distill a tiny teacher into a student, passing the teacher first as a
    live predictor object and then as its saved path."""
    train_data = load_pd.load(
        "https://autogluon-text.s3-accelerate.amazonaws.com/"
        "glue/sst/train.parquet")
    test_data = load_pd.load(
        "https://autogluon-text.s3-accelerate.amazonaws.com/"
        "glue/sst/dev.parquet")
    rng = np.random.RandomState(123)
    train_sel = rng.permutation(len(train_data))[:100]
    test_sel = rng.permutation(len(test_data))[:10]
    train_data = train_data.iloc[train_sel]
    test_data = test_data.iloc[test_sel]
    # Tiny backbone + no workers keeps the test fast.
    hyperparameters = {
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }
    teacher_save_path = os.path.join("sst", "teacher")
    if os.path.exists(teacher_save_path):
        shutil.rmtree(teacher_save_path)
    teacher_predictor = TextPredictor(label="label", eval_metric="acc")
    teacher_predictor = teacher_predictor.fit(
        train_data=train_data,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=teacher_save_path,
    )
    # Distill twice: once handing over the teacher object, once its path.
    for teacher_ref in (teacher_predictor, teacher_predictor.path):
        student_save_path = os.path.join("sst", "student")
        if os.path.exists(student_save_path):
            shutil.rmtree(student_save_path)
        student = TextPredictor(label="label", eval_metric="acc")
        student = student.fit(
            train_data=train_data,
            teacher_predictor=teacher_ref,
            hyperparameters=hyperparameters,
            time_limit=30,
            save_path=student_save_path,
        )
        verify_predictor_save_load(student, test_data)
def fit(cls, train_data,
        label,
        tuning_data=None,
        time_limits=None,
        output_directory='./ag_text',
        feature_columns=None,
        holdout_frac=None,
        eval_metric=None,
        stopping_metric=None,
        nthreads_per_trial=None,
        ngpus_per_trial=None,
        dist_ip_addrs=None,
        num_trials=None,
        search_strategy=None,
        search_options=None,
        scheduler_options=None,
        hyperparameters=None,
        plot_results=None,
        seed=None,
        verbosity=2):
    """Fit models to make predictions based on text inputs.

    Parameters
    ----------
    train_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`
        Training dataset where rows = individual training examples, columns = features.
    label : str or int
        Name of the label column. It can be either the column name (str) or
        the integer index of the column in `train_data`.
    tuning_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`, default = None
        Another dataset containing validation data reserved for hyperparameter tuning
        (in same format as training data). If `tuning_data = None`, `fit()` will
        automatically hold out random examples from `train_data` for validation.
    time_limits : int or str, default = None
        Approximately how long `fit()` should run for (wallclock time in seconds if int).
        String values may instead be used to specify time in different units,
        such as: '1min' or '1hour'. Longer `time_limits` will usually improve
        predictive accuracy. If not specified, `fit()` will run until all models
        to try by default have completed training.
    output_directory : str, default = './ag_text'
        Path to directory where models and intermediate outputs should be saved.
    feature_columns : List[str], default = None
        Which columns of table to consider as predictive features (other columns
        will be ignored, except for label-column). If None (by default), all
        columns of table are considered predictive features.
    holdout_frac : float, default = None
        Fraction of train_data to holdout as tuning data for optimizing
        hyperparameters (ignored unless `tuning_data = None`).
        If None, default value is selected based on the number of training examples.
    eval_metric : str, default = None
        The evaluation metric that will be used to evaluate the model's predictive
        performance. If None, an appropriate default metric will be selected
        (accuracy for classification, mean-squared-error for regression).
        Options for classification include: 'acc' (accuracy),
        'nll' (negative log-likelihood). Additional options for binary
        classification include: 'f1' (F1 score), 'mcc' (Matthews coefficient),
        'auc' (area under ROC curve). Options for regression include:
        'mse' (mean squared error), 'rmse' (root mean squared error),
        'mae' (mean absolute error).
    stopping_metric : str, default = None
        Metric which iteratively-trained models use to early stop to avoid
        overfitting. Defaults to `eval_metric` value (if None). Options are
        identical to options for `eval_metric`.
    nthreads_per_trial : int, default = None
        The number of threads per individual model training run. By default,
        all available CPUs are used.
    ngpus_per_trial : int, default = None
        The number of GPUs to use per individual model training run. If
        unspecified, a default value is chosen based on total number of GPUs available.
    dist_ip_addrs : list, default = None
        List of IP addresses corresponding to remote workers, in order to
        leverage distributed computation. (Currently must be None; see assertion below.)
    num_trials : int, default = None
        The number of trials in the HPO search.
    search_strategy : str, default = None
        Which hyperparameter search algorithm to use. Options include:
        'random' (random search), 'bayesopt' (Gaussian process Bayesian optimization),
        'skopt' (SKopt Bayesian optimization), 'grid' (grid search),
        'hyperband' (Hyperband scheduling with random search),
        'bayesopt-hyperband' (Hyperband scheduling with GP-BO search).
        If unspecified, the default is 'random'.
    search_options : dict, default = None
        Options passed to searcher.
    scheduler_options : dict, default = None
        Additional kwargs passed to scheduler __init__.
    hyperparameters : dict, default = None
        Determines the hyperparameters used by the models. Each hyperparameter
        may be either fixed value or search space of many values. For example
        of default hyperparameters, see:
        `autogluon.task.text_prediction.text_prediction.default()`
    plot_results : bool, default = None
        Whether or not to plot intermediate training results during `fit()`.
        If None, decided by whether we are running inside a notebook.
    seed : int, default = None
        Seed value for random state used inside `fit()`.
    verbosity : int, default = 2
        Verbosity levels range from 0 to 4 and control how much information is
        printed during fit(). Higher levels correspond to more detailed print
        statements (you can set verbosity = 0 to suppress warnings). If using
        logging, you can alternatively control amount of information printed
        via `logger.setLevel(L)`, where `L` ranges from 0 to 50 (Note: higher
        values of `L` correspond to fewer print statements, opposite of
        verbosity levels).

    Returns
    -------
    model
        A `BertForTextPredictionBasic` object that can be used for making
        predictions on new data.
    """
    assert dist_ip_addrs is None, 'Training on remote machine is currently not supported.'
    # Version check of MXNet: this backend only supports the 1.7.x-1.x line.
    if version.parse(mxnet.__version__) < version.parse('1.7.0') \
            or version.parse(mxnet.__version__) >= version.parse('2.0.0'):
        raise ImportError('You will need to ensure that you have mxnet>=1.7.0, <2.0.0. '
                          'For more information about how to install mxnet, you can refer to '
                          'https://sxjscience.github.io/KDD2020/ .')

    # Clamp verbosity into the supported [0, 4] range.
    if verbosity < 0:
        verbosity = 0
    elif verbosity > 4:
        verbosity = 4
    console_log = verbosity >= 2
    logging_config(folder=output_directory, name='ag_text_prediction',
                   logger=logger, level=verbosity2loglevel(verbosity),
                   console=console_log)
    # Parse the hyper-parameters: accept None (defaults), a preset name (str),
    # or a dict merged on top of the defaults.
    if hyperparameters is None:
        hyperparameters = ag_text_prediction_params.create('default')
    elif isinstance(hyperparameters, str):
        hyperparameters = ag_text_prediction_params.create(hyperparameters)
    else:
        base_params = ag_text_prediction_params.create('default')
        hyperparameters = merge_params(base_params, hyperparameters)
    np.random.seed(seed)
    if not isinstance(train_data, pd.DataFrame):
        train_data = load_pd.load(train_data)
    # Infer the label column(s): each entry may be a column name or an index.
    if not isinstance(label, list):
        label = [label]
    label_columns = []
    for ele in label:
        if isinstance(ele, int):
            label_columns.append(train_data.columns[ele])
        else:
            label_columns.append(ele)
    if feature_columns is None:
        # Default: every non-label column is a feature.
        all_columns = list(train_data.columns)
        feature_columns = [ele for ele in all_columns if ele not in label_columns]
    else:
        if isinstance(feature_columns, str):
            feature_columns = [feature_columns]
        for col in feature_columns:
            assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
            assert col in train_data.columns, \
                'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                'all columns = "{}"'.format(col, train_data.columns)
        # Keep columns in the dataframe's original order.
        all_columns = feature_columns + label_columns
        all_columns = [ele for ele in train_data.columns if ele in all_columns]
    if tuning_data is None:
        # Hold out a validation split from train_data.
        if holdout_frac is None:
            holdout_frac = default_holdout_frac(len(train_data), True)
        train_data, tuning_data = random_split_train_val(train_data,
                                                         valid_ratio=holdout_frac)
    else:
        if not isinstance(tuning_data, pd.DataFrame):
            tuning_data = load_pd.load(tuning_data)
    train_data = train_data[all_columns]
    tuning_data = tuning_data[all_columns]
    # Infer per-column properties from train + tuning data together so both
    # splits share the same categorical vocabularies / text settings.
    column_properties = get_column_properties(
        pd.concat([train_data, tuning_data]),
        metadata=None,
        label_columns=label_columns,
        provided_column_properties=None,
        categorical_default_handle_missing_value=True)
    train_data = TabularDataset(train_data,
                                column_properties=column_properties,
                                label_columns=label_columns)
    tuning_data = TabularDataset(tuning_data,
                                 column_properties=train_data.column_properties,
                                 label_columns=label_columns)

    logger.info('Train Dataset:')
    logger.info(train_data)
    logger.info('Tuning Dataset:')
    logger.info(tuning_data)
    logger.debug('Hyperparameters:')
    logger.debug(hyperparameters)
    # This task requires at least one text feature column.
    has_text_column = False
    for k, v in column_properties.items():
        if v.type == _C.TEXT:
            has_text_column = True
            break
    if not has_text_column:
        raise NotImplementedError('No Text Column is found! This is currently not supported by '
                                  'the TextPrediction task. You may try to use '
                                  'TabularPrediction.fit().\n'
                                  'The inferred column properties of the training data is {}'
                                  .format(train_data))
    # One (problem_type, label_shape) pair per label column.
    problem_types = []
    label_shapes = []
    for label_col_name in label_columns:
        problem_type, label_shape = infer_problem_type(
            column_properties=column_properties,
            label_col_name=label_col_name)
        problem_types.append(problem_type)
        label_shapes.append(label_shape)
    # NOTE(review): this uses the root `logging` module rather than the
    # module-level `logger` used elsewhere in this function — looks
    # unintentional, preserved as-is.
    logging.info('Label columns={}, Feature columns={}, Problem types={}, Label shapes={}'
                 .format(label_columns, feature_columns, problem_types, label_shapes))
    # Metrics are inferred from the FIRST label column only.
    eval_metric, stopping_metric, log_metrics = \
        infer_eval_stop_log_metrics(problem_types[0],
                                    label_shapes[0],
                                    eval_metric=eval_metric,
                                    stopping_metric=stopping_metric)
    logging.info('Eval Metric={}, Stop Metric={}, Log Metrics={}'.format(
        eval_metric, stopping_metric, log_metrics))
    # Build model candidates from the hyperparameter config; only the BERT
    # basic model is currently supported.
    model_candidates = []
    for model_type, kwargs in hyperparameters['models'].items():
        search_space = kwargs['search_space']
        if model_type == 'BertForTextPredictionBasic':
            model = BertForTextPredictionBasic(
                column_properties=column_properties,
                label_columns=label_columns,
                feature_columns=feature_columns,
                label_shapes=label_shapes,
                problem_types=problem_types,
                stopping_metric=stopping_metric,
                log_metrics=log_metrics,
                base_config=None,
                search_space=search_space,
                output_directory=output_directory,
                logger=logger)
            model_candidates.append(model)
        else:
            raise ValueError('model_type = "{}" is not supported. You can try to use '
                             'model_type = "BertForTextPredictionBasic"'.format(
                                 model_type))
    assert len(model_candidates) == 1, 'Only one model is supported currently'
    recommended_resource = get_recommended_resource(
        nthreads_per_trial=nthreads_per_trial,
        ngpus_per_trial=ngpus_per_trial)
    # Fall back to HPO defaults from the hyperparameter config where the
    # caller left arguments unspecified.
    if search_strategy is None:
        search_strategy = hyperparameters['hpo_params']['search_strategy']
    if time_limits is None:
        time_limits = hyperparameters['hpo_params']['time_limits']
    else:
        # Accept '<x>min' / '<x>hour' string forms; convert to seconds.
        if isinstance(time_limits, str):
            if time_limits.endswith('min'):
                time_limits = int(float(time_limits[:-3]) * 60)
            elif time_limits.endswith('hour'):
                time_limits = int(float(time_limits[:-4]) * 60 * 60)
            else:
                raise ValueError('The given time_limits="{}" cannot be parsed!'.format(
                    time_limits))
    if num_trials is None:
        num_trials = hyperparameters['hpo_params']['num_trials']
    if scheduler_options is None:
        scheduler_options = hyperparameters['hpo_params']['scheduler_options']
    if scheduler_options is None:
        scheduler_options = dict()
    if search_strategy.endswith('hyperband'):
        # Specific defaults for hyperband scheduling
        scheduler_options['reduction_factor'] = scheduler_options.get(
            'reduction_factor', 4)
        scheduler_options['grace_period'] = scheduler_options.get(
            'grace_period', 10)
        scheduler_options['max_t'] = scheduler_options.get('max_t', 50)
    if recommended_resource['num_gpus'] == 0:
        warnings.warn('Recommend to use GPU to run the TextPrediction task!')
    model = model_candidates[0]
    if plot_results is None:
        # Plot by default only when running inside a notebook.
        if in_ipynb():
            plot_results = True
        else:
            plot_results = False
    model.train(train_data=train_data,
                tuning_data=tuning_data,
                resource=recommended_resource,
                time_limits=time_limits,
                search_strategy=search_strategy,
                search_options=search_options,
                scheduler_options=scheduler_options,
                num_trials=num_trials,
                plot_results=plot_results,
                console_log=verbosity > 2,
                ignore_warning=verbosity <= 2)
    return model
def fit(self,
        train_data,
        tuning_data=None,
        time_limit=None,
        presets=None,
        hyperparameters=None,
        column_types=None,
        num_cpus=None,
        num_gpus=None,
        num_trials=None,
        plot_results=None,
        holdout_frac=None,
        seed=0):
    """
    Fit Transformer models to predict label column of a data table based on the other columns
    (which may contain text or numeric/categorical features).

    Parameters
    ----------
    train_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`
        Table of the training data, which is similar to a pandas DataFrame.
        If str is passed, `train_data` will be loaded using the str value as the file path.
    tuning_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`, default = None
        Another dataset containing validation data reserved for tuning processes such as early
        stopping and hyperparameter tuning. This dataset should be in the same format as
        `train_data`. If str is passed, `tuning_data` will be loaded using the str value as
        the file path. Note: final model returned may be fit on `tuning_data` as well as
        `train_data`. Do not provide your evaluation test data here! If `tuning_data = None`,
        `fit()` will automatically hold out some random validation examples from `train_data`.
    time_limit : int, default = None
        Approximately how long `fit()` should run for (wallclock time in seconds).
        If not specified, `fit()` will run until the model has completed training.
    presets : str, default = None
        Presets are pre-registered configurations that control training
        (hyperparameters and other aspects). It is recommended to specify presets and avoid
        specifying most other `fit()` arguments or model hyperparameters prior to becoming
        familiar with AutoGluon. Print all available presets via
        `autogluon.text.list_presets()`. Some notable presets include:
            - "best_quality": produce the most accurate overall predictor (regardless of
              its efficiency).
            - "medium_quality_faster_train": produce an accurate predictor but take
              efficiency into account (this is the default preset).
            - "lower_quality_fast_train": produce a predictor that is quick to train and
              make predictions with, even if its accuracy is worse.
    hyperparameters : dict, default = None
        The hyperparameters of the `fit()` function, which affect the resulting accuracy of
        the trained predictor. Experienced AutoGluon users can use this argument to specify
        neural network hyperparameter values/search-spaces as well as which
        hyperparameter-tuning strategy should be employed. See the "Text Prediction"
        tutorials for examples.
    column_types : dict, default = None
        The type of data in each table column can be specified via a dictionary that maps
        the column name to its data type. For example:
        `column_types = {"item_name": "text", "brand": "text",
        "product_description": "text", "height": "numerical"}` may be used for a table with
        columns: "item_name", "brand", "product_description", and "height". If None,
        column_types will be automatically inferred from the data.
        The current supported types are:
            - "text": each row in this column contains text (sentence, paragraph, etc.).
            - "numerical": each row in this column contains a number.
            - "categorical": each row in this column belongs to one of K categories.
    num_cpus : int, default = None
        The number of CPUs to use for each training run (i.e. one hyperparameter-tuning trial).
    num_gpus : int, default = None
        The number of GPUs to use for each training run (i.e. one hyperparameter-tuning
        trial). We recommend at least 1 GPU for TextPredictor as its neural network models
        are computationally intensive.
    num_trials : int, default = None
        If hyperparameter-tuning is used, specifies how many HPO trials should be run
        (assuming `time_limit` has not been exceeded). By default, this is the provided
        number of trials in the `hyperparameters` or `presets`. If specified here, this
        value will overwrite the value in `hyperparameters['tune_kwargs']['num_trials']`.
    plot_results : bool, default = None
        Whether to plot intermediate results from training. If None, will be decided based
        on the environment in which `fit()` is run.
    holdout_frac : float, default = None
        Fraction of train_data to holdout as tuning data for optimizing hyperparameters
        (ignored unless `tuning_data = None`). Default value (if None) is selected based on
        the number of rows in the training data and whether hyperparameter-tuning is
        utilized.
    seed : int, default = 0
        The random seed to use for this training run. If None, no seed will be specified
        and repeated runs will produce different results.

    Returns
    -------
    :class:`TextPredictor` object. Returns self.
    """
    # fit() may only be called once per predictor instance.
    assert self._fit_called is False
    verbosity = self.verbosity
    if verbosity is None:
        verbosity = 3
    # Resolve the preset config and overlay any user-provided hyperparameters.
    if presets is not None:
        preset_hparams = ag_text_presets.create(presets)
    else:
        preset_hparams = ag_text_presets.create('default')
    hyperparameters = merge_params(preset_hparams, hyperparameters)
    # Explicit num_trials overrides the value carried by the preset/config.
    if num_trials is not None:
        hyperparameters['tune_kwargs']['num_trials'] = num_trials
    if isinstance(self._label, str):
        label_columns = [self._label]
    else:
        label_columns = list(self._label)
    # Get the training and tuning data as pandas dataframe
    if isinstance(train_data, str):
        train_data = load_pd.load(train_data)
    if not isinstance(train_data, pd.DataFrame):
        raise AssertionError(
            f'train_data is required to be a pandas DataFrame, but was instead: {type(train_data)}'
        )
    all_columns = list(train_data.columns)
    feature_columns = [ele for ele in all_columns if ele not in label_columns]
    train_data = train_data[all_columns]
    # Get tuning data
    if tuning_data is not None:
        if isinstance(tuning_data, str):
            tuning_data = load_pd.load(tuning_data)
        if not isinstance(tuning_data, pd.DataFrame):
            raise AssertionError(
                f'tuning_data is required to be a pandas DataFrame, but was instead: {type(tuning_data)}'
            )
        tuning_data = tuning_data[all_columns]
    else:
        if holdout_frac is None:
            num_trials = hyperparameters['tune_kwargs']['num_trials']
            if num_trials == 1:
                holdout_frac = default_holdout_frac(len(train_data), False)
            else:
                # For HPO, we will need to use a larger held-out ratio
                holdout_frac = default_holdout_frac(len(train_data), True)
        train_data, tuning_data = train_test_split(
            train_data,
            test_size=holdout_frac,
            random_state=np.random.RandomState(seed))
    # Infer column types and the overall problem type from both splits.
    column_types, problem_type = infer_column_problem_types(
        train_data,
        tuning_data,
        label_columns=label_columns,
        problem_type=self._problem_type,
        provided_column_types=column_types)
    self._eval_metric, log_metrics = infer_eval_log_metrics(
        problem_type=problem_type, eval_metric=self._eval_metric)
    # TextPredictor requires at least one text feature column.
    has_text_column = False
    for k, v in column_types.items():
        if v == _C.TEXT:
            has_text_column = True
            break
    if not has_text_column:
        raise AssertionError(
            'No Text Column is found! This is currently not supported by '
            'the TextPredictor. You may try to use '
            'autogluon.tabular.TabularPredictor.\n'
            'The inferred column properties of the training data is {}'.
            format(column_types))
    logger.info('Problem Type="{}"'.format(problem_type))
    logger.info(printable_column_type_string(column_types))
    self._problem_type = problem_type
    if 'models' not in hyperparameters or 'MultimodalTextModel' not in hyperparameters[
            'models']:
        raise ValueError(
            'The current TextPredictor only supports "MultimodalTextModel" '
            'and you must ensure that '
            'hyperparameters["models"]["MultimodalTextModel"] can be accessed.'
        )
    model_hparams = hyperparameters['models']['MultimodalTextModel']
    self._backend = model_hparams['backend']
    if plot_results is None:
        # Plot by default only when running inside a notebook.
        plot_results = in_ipynb()
    if self._backend == 'gluonnlp_v0':
        import warnings
        warnings.filterwarnings('ignore', module='mxnet')
        from ..mx.models import MultiModalTextModel
        self._model = MultiModalTextModel(column_types=column_types,
                                          feature_columns=feature_columns,
                                          label_columns=label_columns,
                                          problem_type=self._problem_type,
                                          eval_metric=self._eval_metric,
                                          log_metrics=log_metrics,
                                          output_directory=self._path)
        self._model.train(train_data=train_data,
                          tuning_data=tuning_data,
                          num_cpus=num_cpus,
                          num_gpus=num_gpus,
                          search_space=model_hparams['search_space'],
                          tune_kwargs=hyperparameters['tune_kwargs'],
                          time_limit=time_limit,
                          seed=seed,
                          plot_results=plot_results,
                          verbosity=verbosity)
    else:
        raise NotImplementedError(
            "Currently, we only support using "
            "the autogluon-contrib-nlp and MXNet "
            "as the backend of AutoGluon-Text. In the future, "
            "we will support other models.")
    logger.info(f'Training completed. Auto-saving to "{self.path}". '
                f'For loading the model, you can use'
                f' `predictor = TextPredictor.load("{self.path}")`')
    self.save(self.path)
    return self