Example #1
def test_cpu_only_raise(set_env_train_without_gpu):
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    if set_env_train_without_gpu is None:
        with pytest.raises(RuntimeError):
            predictor.fit(train_data,
                          hyperparameters=get_test_hyperparameters(),
                          num_gpus=0,
                          seed=123)
    elif set_env_train_without_gpu is True:
        os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '1'
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0,
                      time_limit=30,
                      seed=123)
        verify_predictor_save_load(predictor, dev_data, verify_proba=True)
    else:
        with pytest.raises(RuntimeError):
            predictor.fit(train_data,
                          hyperparameters=get_test_hyperparameters(),
                          num_gpus=0,
                          seed=123)
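The tests in these examples also rely on two helpers defined elsewhere in the test suite: get_test_hyperparameters() and verify_predictor_save_load(). A minimal sketch of what such helpers could look like, assuming the standard TextPredictor save/load/predict API (the hyperparameter key below is a placeholder, not the real test configuration):

import tempfile

from autogluon.text import TextPredictor


def get_test_hyperparameters():
    # Placeholder: a real test helper would shrink the backbone / search space
    # so that fitting finishes within the short time limits used above.
    return {'optimization.max_epochs': 1}  # hypothetical key, for illustration only


def verify_predictor_save_load(predictor, df, verify_proba=False):
    # Hypothetical round-trip check: save the predictor, load it back, and make
    # sure the loaded copy still produces predictions for every row.
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        loaded = TextPredictor.load(root)
        predictions = loaded.predict(df)
        assert len(predictions) == len(df)
        if verify_proba:
            proba = loaded.predict_proba(df)
            assert len(proba) == len(df)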
def test_cpu_only_raise():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    with pytest.raises(RuntimeError):
        predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                             label='label', num_trials=1,
                             ngpus_per_trial=0,
                             verbosity=4,
                             output_directory='./sst',
                             plot_results=False)
    os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '1'
    predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                         label='label', num_trials=1,
                         ngpus_per_trial=0,
                         verbosity=4,
                         output_directory='./sst',
                         plot_results=False)

    os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '0'
    with pytest.raises(RuntimeError):
        predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                             label='label', num_trials=1,
                             ngpus_per_trial=0,
                             verbosity=4,
                             output_directory='./sst',
                             plot_results=False)
Example #3
def test_predictor_fit(key):
    train_data = load_pd.load(DATA_INFO[key]['train'])
    dev_data = load_pd.load(DATA_INFO[key]['dev'])
    label = DATA_INFO[key]['label']
    eval_metric = DATA_INFO[key]['metric']
    verify_proba = DATA_INFO[key]['verify_proba']

    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label=label, eval_metric=eval_metric)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    dev_score = predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)

    # Test for continuous fit
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)

    # Save to a folder, load the saved model, and call fit again (continuous fit)
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictor = TextPredictor.load(root)
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      time_limit=30,
                      seed=123)
def test_mixed_column_type():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sts/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]

    # Add more columns as features
    train_data = pd.DataFrame({'sentence1': train_data['sentence1'],
                               'sentence2': train_data['sentence2'],
                               'sentence3': train_data['sentence2'],
                               'categorical0': train_data['genre'],
                               'numerical0': train_data['score'],
                               'genre': train_data['genre'],
                               'score': train_data['score']})
    dev_data = pd.DataFrame({'sentence1': dev_data['sentence1'],
                             'sentence2': dev_data['sentence2'],
                             'sentence3': dev_data['sentence2'],
                             'categorical0': dev_data['genre'],
                             'numerical0': dev_data['score'],
                             'genre': dev_data['genre'],
                             'score': dev_data['score']})
    # Train Regression
    predictor1 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          label='score', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_score',
                          plot_results=False)
    dev_rmse = predictor1.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor1, dev_data)

    # Train Classification
    predictor2 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          label='genre', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_genre',
                          plot_results=False)
    dev_acc = predictor2.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor2, dev_data, verify_proba=True)

    # Specify the feature columns
    predictor3 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          feature_columns=['sentence1', 'sentence3', 'categorical0'],
                          label='score', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_score',
                          plot_results=False)
    dev_rmse = predictor3.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor3, dev_data)
Example #5
def train(args):
    set_seed(args.seed)
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # Augment the unordered sentence pairs manually by swapping the two sentences.
        train_df_other_part = pd.DataFrame({feature_columns[0]: train_df[feature_columns[1]],
                                            feature_columns[1]: train_df[feature_columns[0]],
                                            label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal',
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal')
    elif args.mode == 'single':
        # When no embedding is used,
        # we just use TextPredictor, which trains a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
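The train() function above reads its configuration from an argparse-style namespace. A minimal command-line entry point consistent with the attributes it accesses could look like this (defaults and help strings are assumptions):

import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='AutoGluon text benchmark runner (sketch).')
    parser.add_argument('--task', type=str, required=True,
                        help='Key into the TASKS dictionary, e.g. "sts" or "mrpc".')
    parser.add_argument('--train_file', type=str, required=True)
    parser.add_argument('--dev_file', type=str, required=True)
    parser.add_argument('--test_file', type=str, required=True)
    parser.add_argument('--mode', choices=['single', 'weighted', 'stacking'], default='single')
    parser.add_argument('--exp_dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=123)
    return parser.parse_args()


if __name__ == '__main__':
    train(parse_args())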
Example #6
def test_mixed_column_type():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sts/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:1000]]
    dev_data = dev_data.iloc[valid_perm[:10]]

    # Add more columns as features
    train_data = pd.DataFrame({'sentence1': train_data['sentence1'],
                               'sentence2': train_data['sentence2'],
                               'sentence3': train_data['sentence2'],
                               'categorical0': train_data['genre'],
                               'numerical0': train_data['score'],
                               'genre': train_data['genre'],
                               'score': train_data['score']})
    dev_data = pd.DataFrame({'sentence1': dev_data['sentence1'],
                             'sentence2': dev_data['sentence2'],
                             'sentence3': dev_data['sentence2'],
                             'categorical0': dev_data['genre'],
                             'numerical0': dev_data['score'],
                             'genre': dev_data['genre'],
                             'score': dev_data['score']})
    # Train Regression
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)

    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor, dev_data)

    # Train Classification
    predictor = TextPredictor(label='genre', verbosity=4)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)

    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor, dev_data, verify_proba=True)

    # Specify the feature column
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data[['sentence1', 'sentence3', 'categorical0', 'score']],
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor, dev_data)
Example #7
def test_cpu_only_warning():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    with pytest.warns(UserWarning):
        predictor.fit(train_data, hyperparameters=get_test_hyperparameters(),
                      num_gpus=0, seed=123)
Example #8
def test_sst(hyperparameters):
    train_data = load_pd.load(
        'https://autogluon-text-data.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text-data.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    predictor.fit(train_data, hyperparameters=hyperparameters)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor, dev_data, verify_proba=True)
def test_no_job_finished_raise():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    with pytest.raises(RuntimeError):
        # Set a very small time limit so that no trial finishes and a RuntimeError is raised
        predictor = task.fit(train_data,
                             hyperparameters=test_hyperparameters,
                             label='label',
                             num_trials=1,
                             ngpus_per_trial=0,
                             verbosity=4,
                             time_limits=10,
                             output_directory='./sst_raise',
                             plot_results=False)
Example #10
def test_sts():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                         label='score', num_trials=1,
                         verbosity=4,
                         ngpus_per_trial=1,
                         output_directory='./sts',
                         plot_results=False)
    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor, dev_data)
Example #11
def test_empty_text_item():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    train_data = train_data.iloc[train_perm[:100]]
    train_data.iat[0, 0] = None
    train_data.iat[10, 0] = None
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(), time_limit=30)
Example #12
def test_predictor_fit(key):
    train_data = load_pd.load(DATA_INFO[key]['train'])
    dev_data = load_pd.load(DATA_INFO[key]['dev'])
    label = DATA_INFO[key]['label']
    eval_metric = DATA_INFO[key]['metric']
    verify_proba = DATA_INFO[key]['verify_proba']

    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label=label, eval_metric=eval_metric)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    dev_score = predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)
Example #13
    def __init__(self, path_or_df: Union[str, pd.DataFrame],
                 *,
                 columns=None,
                 label_columns=None,
                 column_metadata: Optional[Union[str, Dict]] = None,
                 column_properties: Optional[collections.OrderedDict] = None,
                 categorical_default_handle_missing_value=True):
        """

        Parameters
        ----------
        path_or_df
            The path or dataframe of the tabular dataset for NLP.
        columns
            The columns to load from the data.
        label_columns
            The names of the label columns. This helps to infer the column properties.
        column_metadata
            The metadata object that describes the properties of the columns in the dataset.
        column_properties
            The given column properties.
        categorical_default_handle_missing_value
            Whether to handle missing values in categorical columns by default.
        """
        super().__init__()
        if isinstance(path_or_df, pd.DataFrame):
            df = path_or_df
        else:
            df = load_pd.load(path_or_df)
        if columns is not None:
            if not isinstance(columns, list):
                columns = [columns]
            df = df[columns]
        df = normalize_df(df)
        if column_metadata is None:
            column_metadata = dict()
        elif isinstance(column_metadata, str):
            with open(column_metadata, 'r') as f:
                column_metadata = json.load(f)
        # Infer the column properties
        column_properties = get_column_properties(
            df,
            metadata=column_metadata,
            label_columns=label_columns,
            provided_column_properties=column_properties,
            categorical_default_handle_missing_value=categorical_default_handle_missing_value)
        for col_name, prop in column_properties.items():
            if prop.type == _C.TEXT:
                df[col_name] = df[col_name].fillna('').apply(str)
            elif prop.type == _C.NUMERICAL:
                df[col_name] = df[col_name].fillna(-1).apply(np.array)
        self._table = df
        self._column_properties = column_properties
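A usage sketch for the constructor above, assuming it belongs to the TabularDataset wrapper that Example #22 instantiates (the class name and the column_properties accessor are inferred from that example, not shown here):

import pandas as pd

df = pd.DataFrame({
    'sentence1': ['a touching movie', 'a dull sequel'],
    'score': [4.8, 1.2],
})
# Construct directly from a DataFrame; a file path (e.g. a .parquet URL) would also work.
dataset = TabularDataset(df, label_columns=['score'])
print(dataset.column_properties)  # inferred column properties, keyed by column name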
Example #14
def test_no_job_finished_raise():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    with pytest.raises(RuntimeError):
        # Set a very small time limit so that training cannot finish and a RuntimeError is raised
        predictor = TextPredictor(label='label')
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      time_limit=1,
                      num_gpus=1,
                      seed=123)
def test_mrpc():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/mrpc/train.parquet'
    )
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/mrpc/dev.parquet'
    )
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = task.fit(train_data,
                         hyperparameters=test_hyperparameters,
                         label='label',
                         num_trials=1,
                         verbosity=4,
                         ngpus_per_trial=1,
                         output_directory='./mrpc',
                         plot_results=False)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    dev_prediction = predictor.predict(dev_data)
    dev_pred_prob = predictor.predict_proba(dev_data)
Example #16
def test_empty_text_item():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    train_data = train_data.iloc[train_perm[:100]]
    train_data.iat[0, 0] = None
    train_data.iat[10, 0] = None
    predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                         label='score', num_trials=1,
                         ngpus_per_trial=0,
                         verbosity=4,
                         output_directory='./sts_empty_text_item',
                         plot_results=False)
Example #17
def test_preprocessor(dataset_name, url, label_column, backbone_name,
                      all_to_text):
    all_df = load_pd.load(url)
    feature_columns = [col for col in all_df.columns if col != label_column]
    train_df, valid_df = train_test_split(
        all_df, test_size=0.1, random_state=np.random.RandomState(100))
    column_types, problem_type = infer_column_problem_types(
        train_df, valid_df, label_columns=label_column)
    cfg = base_preprocess_cfg()
    if all_to_text:
        cfg.defrost()
        cfg.categorical.convert_to_text = True
        cfg.numerical.convert_to_text = True
        cfg.freeze()
    preprocessor = MultiModalTextFeatureProcessor(column_types=column_types,
                                                  label_column=label_column,
                                                  tokenizer_name=backbone_name,
                                                  cfg=cfg)
    train_dataset = preprocessor.fit_transform(train_df[feature_columns],
                                               train_df[label_column])
    train_dataset_after_transform = preprocessor.transform(
        train_df[feature_columns], train_df[label_column])
    for i in range(len(train_dataset)):
        for j in range(len(train_dataset[0])):
            npt.assert_allclose(train_dataset[i][j],
                                train_dataset_after_transform[i][j], 1E-4,
                                1E-4)
    valid_dataset = preprocessor.transform(valid_df[feature_columns],
                                           valid_df[label_column])
    test_dataset = preprocessor.transform(valid_df[feature_columns])
    assert_dataset_match(train_dataset, train_dataset_after_transform)
    for i in range(len(test_dataset)):
        for j in range(len(test_dataset[0])):
            npt.assert_allclose(valid_dataset[i][j], test_dataset[i][j], 1E-4,
                                1E-4)
    # Test for pickle dump and load
    with tempfile.TemporaryDirectory() as tmp_dir_name:
        with open(os.path.join(tmp_dir_name, 'preprocessor.pkl'),
                  'wb') as out_f:
            pickle.dump(preprocessor, out_f)
        with open(os.path.join(tmp_dir_name, 'preprocessor.pkl'),
                  'rb') as in_f:
            preprocessor_loaded = pickle.load(in_f)
        valid_dataset_loaded = preprocessor_loaded.transform(
            valid_df[feature_columns], valid_df[label_column])
        assert_dataset_match(valid_dataset_loaded, valid_dataset)
        test_dataset_loaded = preprocessor_loaded.transform(
            valid_df[feature_columns])
        assert_dataset_match(test_dataset_loaded, test_dataset)
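assert_dataset_match() is not shown in this example; a minimal version consistent with the explicit npt.assert_allclose loops above could be:

import numpy.testing as npt


def assert_dataset_match(lhs_dataset, rhs_dataset, threshold=1E-4):
    # Compare every feature of every sample within a numerical tolerance,
    # mirroring the inline loops in test_preprocessor above.
    assert len(lhs_dataset) == len(rhs_dataset)
    for i in range(len(lhs_dataset)):
        for j in range(len(lhs_dataset[0])):
            npt.assert_allclose(lhs_dataset[i][j], rhs_dataset[i][j],
                                threshold, threshold)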
Example #18
    def __init__(self, *args, **kwargs):
        file_path = kwargs.get('file_path', None)
        name = kwargs.get('name', None)
        feature_types = kwargs.get('feature_types', None)
        df = kwargs.get('df', None)
        subsample = kwargs.get('subsample', None)
        copy = kwargs.get('copy', False)
        construct_from_df = False  # whether or not we are constructing new dataset object from scratch based on provided DataFrame.
        # if df is None and file_path is None: # Cannot be used currently!
        #     raise ValueError("Must specify either named argument 'file_path' or 'df' in order to construct tabular Dataset")
        if df is not None:  # Create Dataset from existing Python DataFrame:
            construct_from_df = True
            if not isinstance(df, pd.DataFrame):
                raise ValueError(
                    "'df' must be existing pandas DataFrame. To read dataset from file instead, use 'file_path' string argument."
                )
            if file_path is not None:
                warnings.warn(
                    "Both 'df' and 'file_path' supplied. Creating dataset based on DataFrame 'df' rather than reading from file_path."
                )
            if copy:
                df = df.copy(deep=True)
        elif file_path is not None:  # Read from file to create dataset
            construct_from_df = True
            df = load_pd.load(file_path)
        if construct_from_df:  # Construct new Dataset object based off of DataFrame
            if subsample is not None:
                if not isinstance(subsample, int) or subsample <= 1:
                    raise ValueError("'subsample' must be of type int and > 1")
                df = df.head(subsample)
            super().__init__(df)
            self.file_path = file_path
            self.name = name
            self.feature_types = feature_types
            self.subsample = subsample
        else:
            super().__init__(*args, **kwargs)
    def fit(self,
            train_data,
            tuning_data=None,
            time_limit=None,
            presets=None,
            hyperparameters=None,
            feature_columns=None,
            column_types=None,
            num_cpus=None,
            num_gpus=None,
            num_trials=None,
            seed=None):
        """Fit the predictor

        Parameters
        ----------
        train_data
            The training data
        tuning_data
            The tuning data
        time_limit
            The time limit of the fitting process (in seconds)
        presets
            The preset configuration of the hyperparameters
        hyperparameters
            The hyperparameters
        feature_columns
            The columns in the data to use as features. If None, all non-label columns are used.
        column_types
            The provided types of the columns
        num_cpus
            The number of CPUs to use for each trial
        num_gpus
            The number of GPUs to use for each trial
        num_trials
            The number of HPO trials. By default, the number of trials specified in the
            hyperparameters or presets is used; a value given here overrides it.
        seed
            The seed of the experiment

        Returns
        -------
        self
        """
        assert self._fit_called is False
        if presets is not None:
            preset_hparams = ag_text_presets.create(presets)
        else:
            preset_hparams = ag_text_presets.create('default')
        hyperparameters = merge_params(preset_hparams, hyperparameters)
        if seed is not None:
            hyperparameters['seed'] = seed
        seed = hyperparameters['seed']
        if num_trials is not None:
            hyperparameters['hpo_params']['num_trials'] = num_trials
        if isinstance(self._label, str):
            label_columns = [self._label]
        else:
            label_columns = list(self._label)
        # Get the training and tuning data as pandas dataframe
        if not isinstance(train_data, pd.DataFrame):
            train_data = load_pd.load(train_data)
        if feature_columns is None:
            all_columns = list(train_data.columns)
            feature_columns = [
                ele for ele in all_columns if ele not in label_columns
            ]
        else:
            if isinstance(feature_columns, str):
                feature_columns = [feature_columns]
            for col in feature_columns:
                assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
                assert col in train_data.columns,\
                    'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                    'all columns = "{}"'.format(col, train_data.columns)
            all_columns = feature_columns + label_columns
        train_data = train_data[all_columns]
        # Get tuning data
        if tuning_data is not None:
            if not isinstance(tuning_data, pd.DataFrame):
                tuning_data = load_pd.load(tuning_data)
            tuning_data = tuning_data[all_columns]
        else:
            if hyperparameters['misc']['holdout_frac'] is not None:
                holdout_frac = hyperparameters['misc']['holdout_frac']
            else:
                num_trials = hyperparameters['hpo_params']['num_trials']
                if num_trials == 1:
                    holdout_frac = default_holdout_frac(len(train_data), False)
                else:
                    # For HPO, we will need to use a larger held-out ratio
                    holdout_frac = default_holdout_frac(len(train_data), True)
            train_data, tuning_data = train_test_split(
                train_data,
                test_size=holdout_frac,
                random_state=np.random.RandomState(seed))
        column_types, problem_type = infer_column_problem_types(
            train_data,
            tuning_data,
            label_columns=label_columns,
            problem_type=self._problem_type,
            provided_column_types=column_types)
        self._eval_metric, log_metrics = infer_eval_log_metrics(
            problem_type=problem_type, eval_metric=self._eval_metric)
        has_text_column = False
        for k, v in column_types.items():
            if v == _C.TEXT:
                has_text_column = True
                break
        if not has_text_column:
            raise AssertionError(
                'No Text Column is found! This is currently not supported by '
                'the TextPrediction task. You may try to use '
                'autogluon.tabular.TabularPredictor.\n'
                'The inferred column properties of the training data is {}'.
                format(train_data))
        logger.log(25, 'Problem Type="{}"'.format(problem_type))
        logger.log(25, printable_column_type_string(column_types))
        self._problem_type = problem_type
        model_hparams = hyperparameters['models']['MultimodalTextModel']
        self._backend = model_hparams['backend']
        if model_hparams['backend'] == 'gluonnlp_v0':
            from ..mx.models import MultiModalTextModel
            self._model = MultiModalTextModel(column_types=column_types,
                                              feature_columns=feature_columns,
                                              label_columns=label_columns,
                                              problem_type=self._problem_type,
                                              eval_metric=self._eval_metric,
                                              log_metrics=log_metrics,
                                              output_directory=self._path)
            self._model.train(train_data=train_data,
                              tuning_data=tuning_data,
                              num_cpus=num_cpus,
                              num_gpus=num_gpus,
                              search_space=model_hparams['search_space'],
                              hpo_params=hyperparameters['hpo_params'],
                              time_limit=time_limit,
                              seed=seed,
                              verbosity=self.verbosity)
        else:
            raise NotImplementedError(
                "Currently, we only support using "
                "the autogluon-contrib-nlp and MXNet "
                "as the backend of AutoGluon-Text. In the future, "
                "we will support other models.")
        return self
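A representative call to the fit() method above, using the same SST data as the tests in this file (resource and time-limit choices are illustrative):

from autogluon.core.utils.loaders import load_pd
from autogluon.text import TextPredictor

train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                          'glue/sst/train.parquet')
dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                        'glue/sst/dev.parquet')
predictor = TextPredictor(label='label', eval_metric='acc', path='./ag_sst')
predictor.fit(train_data,
              num_gpus=1,
              time_limit=600,
              seed=123)
print(predictor.evaluate(dev_data, metrics=['acc']))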
Example #20
def test_multimodal_batchify(dataset_name, url, label_column,
                             backbone_name, all_to_text, insert_sep,
                             stochastic_chunk):
    # Test for multimodal batchify
    all_df = load_pd.load(url)
    feature_columns = [col for col in all_df.columns if col != label_column]
    train_df, valid_df = train_test_split(all_df, test_size=0.1,
                                          random_state=np.random.RandomState(100))
    column_types, problem_type = infer_column_problem_types(train_df, valid_df,
                                                            label_columns=label_column)
    cfg = base_preprocess_cfg()
    if all_to_text:
        cfg.defrost()
        cfg.categorical.convert_to_text = True
        cfg.numerical.convert_to_text = True
        cfg.freeze()

    preprocessor = MultiModalTextFeatureProcessor(column_types=column_types,
                                                  label_column=label_column,
                                                  tokenizer_name=backbone_name,
                                                  cfg=cfg)
    cls_id, sep_id = get_cls_sep_id(preprocessor.tokenizer)
    train_dataset = preprocessor.fit_transform(train_df[feature_columns], train_df[label_column])
    test_dataset = preprocessor.transform(valid_df[feature_columns])
    auto_max_length = auto_shrink_max_length(train_dataset=train_dataset,
                                             insert_sep=insert_sep,
                                             num_text_features=len(preprocessor.text_feature_names),
                                             auto_max_length_quantile=0.9,
                                             round_to=32,
                                             max_length=512)
    train_batchify_fn = MultiModalTextBatchify(num_text_inputs=len(preprocessor.text_feature_names),
                                               num_categorical_inputs=len(preprocessor.categorical_feature_names),
                                               num_numerical_inputs=len(preprocessor.numerical_feature_names) > 0,
                                               cls_token_id=cls_id,
                                               sep_token_id=sep_id,
                                               max_length=auto_max_length,
                                               mode='train',
                                               stochastic_chunk=stochastic_chunk,
                                               insert_sep=insert_sep)
    test_batchify_fn = MultiModalTextBatchify(num_text_inputs=len(preprocessor.text_feature_names),
                                              num_categorical_inputs=len(preprocessor.categorical_feature_names),
                                              num_numerical_inputs=len(preprocessor.numerical_feature_names) > 0,
                                              cls_token_id=cls_id,
                                              sep_token_id=sep_id,
                                              max_length=auto_max_length,
                                              mode='test',
                                              stochastic_chunk=stochastic_chunk,
                                              insert_sep=insert_sep)
    train_dataloader = DataLoader(train_dataset, batch_size=4,
                                  batchify_fn=train_batchify_fn, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=4,
                                 batchify_fn=test_batchify_fn, shuffle=False)
    for sample in train_dataloader:
        features, label = sample[0], sample[1]
        assert len(features) == train_batchify_fn.num_text_outputs + \
               train_batchify_fn.num_categorical_outputs + train_batchify_fn.num_numerical_outputs
        text_token_ids, text_valid_length, text_segment_ids = features[0]
        assert text_token_ids.shape[1] <= auto_max_length
        assert text_segment_ids.shape[1] <= auto_max_length
        assert text_token_ids.shape == text_segment_ids.shape
    for sample in test_dataloader:
        assert len(sample) == test_batchify_fn.num_text_outputs + \
               test_batchify_fn.num_categorical_outputs + test_batchify_fn.num_numerical_outputs
        text_token_ids, text_valid_length, text_segment_ids = sample[0]
        assert text_token_ids.shape[1] <= auto_max_length
        assert text_segment_ids.shape[1] <= auto_max_length
        assert text_token_ids.shape == text_segment_ids.shape
Example #21
def test_distillation():
    train_data = load_pd.load(
        "https://autogluon-text.s3-accelerate.amazonaws.com/"
        "glue/sst/train.parquet")
    test_data = load_pd.load(
        "https://autogluon-text.s3-accelerate.amazonaws.com/"
        "glue/sst/dev.parquet")
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    test_perm = rng_state.permutation(len(test_data))
    train_data = train_data.iloc[train_perm[:100]]
    test_data = test_data.iloc[test_perm[:10]]

    teacher_predictor = TextPredictor(label="label", eval_metric="acc")

    hyperparameters = {
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }

    teacher_save_path = os.path.join("sst", "teacher")
    if os.path.exists(teacher_save_path):
        shutil.rmtree(teacher_save_path)

    teacher_predictor = teacher_predictor.fit(
        train_data=train_data,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=teacher_save_path,
    )

    # test for distillation
    predictor = TextPredictor(label="label", eval_metric="acc")

    student_save_path = os.path.join("sst", "student")
    if os.path.exists(student_save_path):
        shutil.rmtree(student_save_path)

    predictor = predictor.fit(
        train_data=train_data,
        teacher_predictor=teacher_predictor,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=student_save_path,
    )
    verify_predictor_save_load(predictor, test_data)

    # test for distillation with teacher predictor path
    predictor = TextPredictor(label="label", eval_metric="acc")

    student_save_path = os.path.join("sst", "student")
    if os.path.exists(student_save_path):
        shutil.rmtree(student_save_path)

    predictor = predictor.fit(
        train_data=train_data,
        teacher_predictor=teacher_predictor.path,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=student_save_path,
    )
    verify_predictor_save_load(predictor, test_data)
Example #22
    def fit(cls,
            train_data,
            label,
            tuning_data=None,
            time_limits=None,
            output_directory='./ag_text',
            feature_columns=None,
            holdout_frac=None,
            eval_metric=None,
            stopping_metric=None,
            nthreads_per_trial=None,
            ngpus_per_trial=None,
            dist_ip_addrs=None,
            num_trials=None,
            search_strategy=None,
            search_options=None,
            scheduler_options=None,
            hyperparameters=None,
            plot_results=None,
            seed=None,
            verbosity=2):
        """Fit models to make predictions based on text inputs.

        Parameters
        ----------
        train_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`
            Training dataset where rows = individual training examples, columns = features.
        label : str or int
            Name of the label column. It can be a string column name or an int index into `train_data.columns`.
        tuning_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`, default = None
            Another dataset containing validation data reserved for hyperparameter tuning (in same format as training data).
            If `tuning_data = None`, `fit()` will automatically hold out random examples from `train_data` for validation.
        time_limits : int or str, default = None
            Approximately how long `fit()` should run for (wallclock time in seconds if int).
            String values may instead be used to specify time in different units such as: '1min' or '1hour'.
            Longer `time_limits` will usually improve predictive accuracy.
            If not specified, `fit()` will run until all models to try by default have completed training.
        output_directory : str, default = './ag_text'
            Path to directory where models and intermediate outputs should be saved.
        feature_columns : List[str], default = None
            Which columns of table to consider as predictive features (other columns will be ignored, except for label-column).
            If None (by default), all columns of table are considered predictive features.
        holdout_frac : float, default = None
            Fraction of train_data to holdout as tuning data for optimizing hyperparameters (ignored unless `tuning_data = None`).
            If None, default value is selected based on the number of training examples.
        eval_metric : str, default = None
            The evaluation metric that will be used to evaluate the model's predictive performance.
            If None, an appropriate default metric will be selected (accuracy for classification, mean-squared-error for regression).
            Options for classification include: 'acc' (accuracy), 'nll' (negative log-likelihood).
            Additional options for binary classification include: 'f1' (F1 score), 'mcc' (Matthews coefficient), 'auc' (area under ROC curve).
            Options for regression include: 'mse' (mean squared error), 'rmse' (root mean squared error), 'mae' (mean absolute error).
        stopping_metric : str, default = None
            Metric which iteratively-trained models use to early stop to avoid overfitting.
            Defaults to `eval_metric` value (if None).
            Options are identical to options for `eval_metric`.
        nthreads_per_trial : int, default = None
            The number of threads per individual model training run. By default, all available CPUs are used.
        ngpus_per_trial : int, default = None
            The number of GPUs to use per individual model training run. If unspecified, a default value is chosen based on total number of GPUs available.
        dist_ip_addrs : list, default = None
            List of IP addresses corresponding to remote workers, in order to leverage distributed computation.
        num_trials : int, default = None
            The number of trials in the HPO search.
        search_strategy : str, default = None
            Which hyperparameter search algorithm to use. Options include:
            'random' (random search), 'bayesopt' (Gaussian process Bayesian optimization),
            'skopt' (SKopt Bayesian optimization), 'grid' (grid search),
            'hyperband' (Hyperband scheduling with random search), 'bayesopt-hyperband'
            (Hyperband scheduling with GP-BO search).
            If unspecified, the default is 'random'.
        search_options : dict, default = None
            Options passed to searcher.
        scheduler_options : dict, default = None
            Additional kwargs passed to scheduler __init__.
        hyperparameters : dict, default = None
            Determines the hyperparameters used by the models. Each hyperparameter may be either fixed value or search space of many values.
            For example of default hyperparameters, see: `autogluon.task.text_prediction.text_prediction.default()`
        plot_results : bool, default = None
            Whether or not to plot intermediate training results during `fit()`.
        seed : int, default = None
            Seed value for random state used inside `fit()`. 
        verbosity : int, default = 2
            Verbosity levels range from 0 to 4 and control how much information is printed
            during fit().
            Higher levels correspond to more detailed print statements
            (you can set verbosity = 0 to suppress warnings).
            If using logging, you can alternatively control amount of information printed
            via `logger.setLevel(L)`,
            where `L` ranges from 0 to 50 (Note: higher values of `L` correspond to fewer print
            statements, opposite of verbosity levels)

        Returns
        -------
        model
            A `BertForTextPredictionBasic` object that can be used for making predictions on new data.
        """
        assert dist_ip_addrs is None, 'Training on remote machine is currently not supported.'
        # Version check of MXNet
        if version.parse(mxnet.__version__) < version.parse('1.7.0') \
                or version.parse(mxnet.__version__) >= version.parse('2.0.0'):
            raise ImportError(
                'You will need to ensure that you have mxnet>=1.7.0, <2.0.0. '
                'For more information about how to install mxnet, you can refer to '
                'https://sxjscience.github.io/KDD2020/ .')

        if verbosity < 0:
            verbosity = 0
        elif verbosity > 4:
            verbosity = 4
        console_log = verbosity >= 2
        logging_config(folder=output_directory,
                       name='ag_text_prediction',
                       logger=logger,
                       level=verbosity2loglevel(verbosity),
                       console=console_log)
        # Parse the hyper-parameters
        if hyperparameters is None:
            hyperparameters = ag_text_prediction_params.create('default')
        elif isinstance(hyperparameters, str):
            hyperparameters = ag_text_prediction_params.create(hyperparameters)
        else:
            base_params = ag_text_prediction_params.create('default')
            hyperparameters = merge_params(base_params, hyperparameters)
        np.random.seed(seed)
        if not isinstance(train_data, pd.DataFrame):
            train_data = load_pd.load(train_data)
        # Infer the label columns
        if not isinstance(label, list):
            label = [label]
        label_columns = []
        for ele in label:
            if isinstance(ele, int):
                label_columns.append(train_data.columns[ele])
            else:
                label_columns.append(ele)
        if feature_columns is None:
            all_columns = list(train_data.columns)
            feature_columns = [
                ele for ele in all_columns if ele not in label_columns
            ]
        else:
            if isinstance(feature_columns, str):
                feature_columns = [feature_columns]
            for col in feature_columns:
                assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
                assert col in train_data.columns,\
                    'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                    'all columns = "{}"'.format(col, train_data.columns)
            all_columns = feature_columns + label_columns
            all_columns = [
                ele for ele in train_data.columns if ele in all_columns
            ]
        if tuning_data is None:
            if holdout_frac is None:
                holdout_frac = default_holdout_frac(len(train_data), True)
            train_data, tuning_data = random_split_train_val(
                train_data, valid_ratio=holdout_frac)
        else:
            if not isinstance(tuning_data, pd.DataFrame):
                tuning_data = load_pd.load(tuning_data)
        train_data = train_data[all_columns]
        tuning_data = tuning_data[all_columns]
        column_properties = get_column_properties(
            pd.concat([train_data, tuning_data]),
            metadata=None,
            label_columns=label_columns,
            provided_column_properties=None,
            categorical_default_handle_missing_value=True)
        train_data = TabularDataset(train_data,
                                    column_properties=column_properties,
                                    label_columns=label_columns)
        tuning_data = TabularDataset(
            tuning_data,
            column_properties=train_data.column_properties,
            label_columns=label_columns)

        logger.info('Train Dataset:')
        logger.info(train_data)
        logger.info('Tuning Dataset:')
        logger.info(tuning_data)
        logger.debug('Hyperparameters:')
        logger.debug(hyperparameters)
        has_text_column = False
        for k, v in column_properties.items():
            if v.type == _C.TEXT:
                has_text_column = True
                break
        if not has_text_column:
            raise NotImplementedError('No Text Column is found! This is currently not supported by '
                                      'the TextPrediction task. You may try to use '
                                      'TabularPrediction.fit().\n' \
                                      'The inferred column properties of the training data is {}'
                                      .format(train_data))
        problem_types = []
        label_shapes = []
        for label_col_name in label_columns:
            problem_type, label_shape = infer_problem_type(
                column_properties=column_properties,
                label_col_name=label_col_name)
            problem_types.append(problem_type)
            label_shapes.append(label_shape)
        logging.info(
            'Label columns={}, Feature columns={}, Problem types={}, Label shapes={}'
            .format(label_columns, feature_columns, problem_types,
                    label_shapes))
        eval_metric, stopping_metric, log_metrics =\
            infer_eval_stop_log_metrics(problem_types[0],
                                        label_shapes[0],
                                        eval_metric=eval_metric,
                                        stopping_metric=stopping_metric)
        logging.info('Eval Metric={}, Stop Metric={}, Log Metrics={}'.format(
            eval_metric, stopping_metric, log_metrics))
        model_candidates = []
        for model_type, kwargs in hyperparameters['models'].items():
            search_space = kwargs['search_space']
            if model_type == 'BertForTextPredictionBasic':
                model = BertForTextPredictionBasic(
                    column_properties=column_properties,
                    label_columns=label_columns,
                    feature_columns=feature_columns,
                    label_shapes=label_shapes,
                    problem_types=problem_types,
                    stopping_metric=stopping_metric,
                    log_metrics=log_metrics,
                    base_config=None,
                    search_space=search_space,
                    output_directory=output_directory,
                    logger=logger)
                model_candidates.append(model)
            else:
                raise ValueError(
                    'model_type = "{}" is not supported. You can try to use '
                    'model_type = "BertForTextPredictionBasic"'.format(
                        model_type))
        assert len(
            model_candidates) == 1, 'Only one model is supported currently'
        recommended_resource = get_recommended_resource(
            nthreads_per_trial=nthreads_per_trial,
            ngpus_per_trial=ngpus_per_trial)
        if search_strategy is None:
            search_strategy = hyperparameters['hpo_params']['search_strategy']
        if time_limits is None:
            time_limits = hyperparameters['hpo_params']['time_limits']
        else:
            if isinstance(time_limits, str):
                if time_limits.endswith('min'):
                    time_limits = int(float(time_limits[:-3]) * 60)
                elif time_limits.endswith('hour'):
                    time_limits = int(float(time_limits[:-4]) * 60 * 60)
                else:
                    raise ValueError(
                        'The given time_limits="{}" cannot be parsed!'.format(
                            time_limits))
        if num_trials is None:
            num_trials = hyperparameters['hpo_params']['num_trials']
        if scheduler_options is None:
            scheduler_options = hyperparameters['hpo_params'][
                'scheduler_options']
            if scheduler_options is None:
                scheduler_options = dict()
        if search_strategy.endswith('hyperband'):
            # Specific defaults for hyperband scheduling
            scheduler_options['reduction_factor'] = scheduler_options.get(
                'reduction_factor', 4)
            scheduler_options['grace_period'] = scheduler_options.get(
                'grace_period', 10)
            scheduler_options['max_t'] = scheduler_options.get('max_t', 50)

        if recommended_resource['num_gpus'] == 0:
            warnings.warn(
                'Recommend to use GPU to run the TextPrediction task!')
        model = model_candidates[0]
        if plot_results is None:
            if in_ipynb():
                plot_results = True
            else:
                plot_results = False
        model.train(train_data=train_data,
                    tuning_data=tuning_data,
                    resource=recommended_resource,
                    time_limits=time_limits,
                    search_strategy=search_strategy,
                    search_options=search_options,
                    scheduler_options=scheduler_options,
                    num_trials=num_trials,
                    plot_results=plot_results,
                    console_log=verbosity > 2,
                    ignore_warning=verbosity <= 2)
        return model
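The time_limits argument above accepts either seconds or the '<x>min'/'<x>hour' string forms described in the docstring; pulled out on its own, the conversion in fit() behaves like this:

def parse_time_limits(time_limits):
    # Standalone copy of the string-parsing branch in fit(), for illustration only.
    if isinstance(time_limits, str):
        if time_limits.endswith('min'):
            return int(float(time_limits[:-3]) * 60)
        elif time_limits.endswith('hour'):
            return int(float(time_limits[:-4]) * 60 * 60)
        raise ValueError('The given time_limits="{}" cannot be parsed!'.format(time_limits))
    return time_limits


assert parse_time_limits('30min') == 1800
assert parse_time_limits('1.5hour') == 5400
assert parse_time_limits(120) == 120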
Example #23
    def fit(self,
            train_data,
            tuning_data=None,
            time_limit=None,
            presets=None,
            hyperparameters=None,
            column_types=None,
            num_cpus=None,
            num_gpus=None,
            num_trials=None,
            plot_results=None,
            holdout_frac=None,
            seed=0):
        """
        Fit Transformer models to predict label column of a data table based on the other columns (which may contain text or numeric/categorical features).

        Parameters
        ----------
        train_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`
            Table of the training data, which is similar to a pandas DataFrame.
            If str is passed, `train_data` will be loaded using the str value as the file path.
        tuning_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`, default = None
            Another dataset containing validation data reserved for tuning processes such as early stopping and hyperparameter tuning.
            This dataset should be in the same format as `train_data`.
            If str is passed, `tuning_data` will be loaded using the str value as the file path.
            Note: final model returned may be fit on `tuning_data` as well as `train_data`. Do not provide your evaluation test data here!
            If `tuning_data = None`, `fit()` will automatically hold out some random validation examples from `train_data`.
        time_limit : int, default = None
            Approximately how long `fit()` should run for (wallclock time in seconds).
            If not specified, `fit()` will run until the model has completed training.
        presets : str, default = None
            Presets are pre-registered configurations that control training (hyperparameters and other aspects).
            It is recommended to specify presets and avoid specifying most other `fit()` arguments or model hyperparameters prior to becoming familiar with AutoGluon.
            Print all available presets via `autogluon.text.list_presets()`.
            Some notable presets include:
                - "best_quality": produce the most accurate overall predictor (regardless of its efficiency).
                - "medium_quality_faster_train": produce an accurate predictor but take efficiency into account (this is the default preset).
                - "lower_quality_fast_train": produce a predict that is quick to train and make predictions with, even if its accuracy is worse.
        hyperparameters : dict, default = None
            The hyperparameters of the `fit()` function, which affect the resulting accuracy of the trained predictor.
            Experienced AutoGluon users can use this argument to specify neural network hyperparameter values/search-spaces as well as which hyperparameter-tuning strategy should be employed. See the "Text Prediction" tutorials for examples.
        column_types : dict, default = None
            The type of data in each table column can be specified via a dictionary that maps the column name to its data type.
            For example: `column_types = {"item_name": "text", "brand": "text", "product_description": "text", "height": "numerical"}` may be used for a table with columns: "item_name", "brand", "product_description", and "height".
            If None, column_types will be automatically inferred from the data.
            The current supported types are:
            - "text": each row in this column contains text (sentence, paragraph, etc.).
            - "numerical": each row in this column contains a number.
            - "categorical": each row in this column belongs to one of K categories.
        num_cpus : int, default = None
            The number of CPUs to use for each training run (i.e. one hyperparameter-tuning trial).
        num_gpus : int, default = None
            The number of GPUs to use for each training run (i.e. one hyperparameter-tuning trial). We recommend at least 1 GPU for TextPredictor as its neural network models are computationally intensive.
        num_trials : int, default = None
            If hyperparameter-tuning is used, specifies how many HPO trials should be run (assuming `time_limit` has not been exceeded).
            By default, this is the number of trials specified in `hyperparameters` or by the chosen `presets`.
            If specified here, this value will overwrite the value in `hyperparameters['tune_kwargs']['num_trials']`.
        plot_results : bool, default = None
            Whether to plot intermediate results from training. If None, will be decided based on the environment in which `fit()` is run.
        holdout_frac : float, default = None
            Fraction of train_data to holdout as tuning data for optimizing hyperparameters (ignored unless `tuning_data = None`).
            Default value (if None) is selected based on the number of rows in the training data and whether hyperparameter-tuning is utilized.
        seed : int, default = 0
            The random seed to use for this training run. If None, no seed will be specified and repeated runs will produce different results.

        Returns
        -------
        :class:`TextPredictor` object. Returns self.
        """
        assert self._fit_called is False
        verbosity = self.verbosity
        if verbosity is None:
            verbosity = 3
        if presets is not None:
            preset_hparams = ag_text_presets.create(presets)
        else:
            preset_hparams = ag_text_presets.create('default')
        hyperparameters = merge_params(preset_hparams, hyperparameters)
        if num_trials is not None:
            hyperparameters['tune_kwargs']['num_trials'] = num_trials
        if isinstance(self._label, str):
            label_columns = [self._label]
        else:
            label_columns = list(self._label)
        # Get the training and tuning data as pandas dataframe
        if isinstance(train_data, str):
            train_data = load_pd.load(train_data)
        if not isinstance(train_data, pd.DataFrame):
            raise AssertionError(
                f'train_data is required to be a pandas DataFrame, but was instead: {type(train_data)}'
            )
        all_columns = list(train_data.columns)
        feature_columns = [
            ele for ele in all_columns if ele not in label_columns
        ]
        train_data = train_data[all_columns]
        # Get tuning data
        if tuning_data is not None:
            if isinstance(tuning_data, str):
                tuning_data = load_pd.load(tuning_data)
            if not isinstance(tuning_data, pd.DataFrame):
                raise AssertionError(
                    f'tuning_data is required to be a pandas DataFrame, but was instead: {type(tuning_data)}'
                )
            tuning_data = tuning_data[all_columns]
        else:
            if holdout_frac is None:
                num_trials = hyperparameters['tune_kwargs']['num_trials']
                if num_trials == 1:
                    holdout_frac = default_holdout_frac(len(train_data), False)
                else:
                    # For HPO, we will need to use a larger held-out ratio
                    holdout_frac = default_holdout_frac(len(train_data), True)
            train_data, tuning_data = train_test_split(
                train_data,
                test_size=holdout_frac,
                random_state=np.random.RandomState(seed))
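        # Infer the type of each column (text / numerical / categorical) and the problem type
        # from the training and tuning data, respecting any user-provided column_types.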
        column_types, problem_type = infer_column_problem_types(
            train_data,
            tuning_data,
            label_columns=label_columns,
            problem_type=self._problem_type,
            provided_column_types=column_types)
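        # Choose the evaluation metric and the metrics to log based on the inferred problem type.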
        self._eval_metric, log_metrics = infer_eval_log_metrics(
            problem_type=problem_type, eval_metric=self._eval_metric)
        has_text_column = False
        for k, v in column_types.items():
            if v == _C.TEXT:
                has_text_column = True
                break
        if not has_text_column:
            raise AssertionError(
                'No Text Column is found! This is currently not supported by '
                'the TextPredictor. You may try to use '
                'autogluon.tabular.TabularPredictor.\n'
                'The inferred column properties of the training data are {}'.
                format(column_types))
        logger.info('Problem Type="{}"'.format(problem_type))
        logger.info(printable_column_type_string(column_types))
        self._problem_type = problem_type
        if 'models' not in hyperparameters or 'MultimodalTextModel' not in hyperparameters[
                'models']:
            raise ValueError(
                'The current TextPredictor only supports "MultimodalTextModel" '
                'and you must ensure that '
                'hyperparameters["models"]["MultimodalTextModel"] can be accessed.'
            )
        model_hparams = hyperparameters['models']['MultimodalTextModel']
        self._backend = model_hparams['backend']
        if plot_results is None:
            plot_results = in_ipynb()
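        # Dispatch training to the backend; only the MXNet-based 'gluonnlp_v0' backend is
        # currently supported (any other value raises NotImplementedError below).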
        if self._backend == 'gluonnlp_v0':
            import warnings
            warnings.filterwarnings('ignore', module='mxnet')
            from ..mx.models import MultiModalTextModel
            self._model = MultiModalTextModel(column_types=column_types,
                                              feature_columns=feature_columns,
                                              label_columns=label_columns,
                                              problem_type=self._problem_type,
                                              eval_metric=self._eval_metric,
                                              log_metrics=log_metrics,
                                              output_directory=self._path)
            self._model.train(train_data=train_data,
                              tuning_data=tuning_data,
                              num_cpus=num_cpus,
                              num_gpus=num_gpus,
                              search_space=model_hparams['search_space'],
                              tune_kwargs=hyperparameters['tune_kwargs'],
                              time_limit=time_limit,
                              seed=seed,
                              plot_results=plot_results,
                              verbosity=verbosity)
        else:
            raise NotImplementedError(
                "Currently, we only support using "
                "the autogluon-contrib-nlp and MXNet "
                "as the backend of AutoGluon-Text. In the future, "
                "we will support other models.")
        logger.info(f'Training completed. Auto-saving to "{self.path}". '
                    f'For loading the model, you can use'
                    f' `predictor = TextPredictor.load("{self.path}")`')
        self.save(self.path)
        return self
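As a companion to the docstring above, here is a minimal usage sketch of `TextPredictor.fit()`. The SST parquet URLs mirror the earlier examples; the `load_pd` import path and the chosen `presets`, `time_limit`, and `num_gpus` values are assumptions for illustration, not part of the snippet above.

from autogluon.core.utils.loaders import load_pd
from autogluon.text import TextPredictor

# Load the SST train/dev tables used in the earlier examples.
train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                          'glue/sst/train.parquet')
dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                        'glue/sst/dev.parquet')

predictor = TextPredictor(label='label', eval_metric='acc')
# Illustrative settings: the preset name comes from the docstring above;
# time_limit, num_gpus, and seed are arbitrary choices for this sketch.
predictor.fit(train_data,
              presets='medium_quality_faster_train',
              time_limit=600,
              num_gpus=1,
              seed=123)
predictions = predictor.predict(dev_data)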