Пример #1
0
def test_no_text_column_raise():
    """Expect ``fit`` to raise ``AssertionError`` on an emoji-only frame.

    The single feature column holds just three distinct emoji strings, each
    repeated many times. Going by the test name, the predictor is presumably
    expected to find no usable text column in such data.
    """
    data = [('😁😁😁😁😁😁', 'grin')] * 2000 + [('😃😃😃😃😃😃😃😃', 'smile')
                                              ] * 1000 + [('😉😉😉', 'wink')] * 1000

    df = pd.DataFrame(data, columns=['data', 'label'])
    # Build the predictor outside the ``raises`` block: only ``fit`` may
    # satisfy the expected AssertionError, otherwise a failing constructor
    # would make this test pass for the wrong reason.
    predictor = TextPredictor(label='label', verbosity=4)
    with pytest.raises(AssertionError):
        predictor.fit(df, hyperparameters=get_test_hyperparameters(), seed=123)
Пример #2
0
def test_empty_text_item():
    """Check that ``fit`` runs when some first-column cells are ``None``.

    A fixed random sample of 100 STS training rows is taken, two cells of the
    first column are blanked out, and a predictor for ``score`` is fit with a
    30 second time limit.
    """
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    # Take an explicit copy: the sample is modified in place below, and
    # writing into the result of an ``iloc`` selection can trigger pandas'
    # SettingWithCopyWarning.
    train_data = train_data.iloc[train_perm[:100]].copy()
    train_data.iat[0, 0] = None
    train_data.iat[10, 0] = None
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(), time_limit=30)
Пример #3
0
def test_no_job_finished_raise():
    """Expect ``fit`` to raise ``RuntimeError`` when given a 1 second budget."""
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    # Construct the predictor outside the ``raises`` block so that only the
    # ``fit`` call can satisfy the expected RuntimeError.
    predictor = TextPredictor(label='label')
    with pytest.raises(RuntimeError):
        # A very small time limit is used to trigger the failure path.
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      time_limit=1,
                      num_gpus=1,
                      seed=123)
Пример #4
0
def test_cpu_only_warning():
    """Expect ``fit`` with ``num_gpus=0`` to emit a ``UserWarning``.

    Only the training split is needed. The dev split that used to be
    downloaded and sub-sampled here was never used, so it is no longer
    fetched. The training permutation is still the first draw from the seeded
    generator, so the selected 100 rows are unchanged.
    """
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    train_data = train_data.iloc[train_perm[:100]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    with pytest.warns(UserWarning):
        predictor.fit(train_data, hyperparameters=get_test_hyperparameters(),
                      num_gpus=0, seed=123)
Пример #5
0
def test_cpu_only_raise(set_env_train_without_gpu):
    """Exercise CPU-only fitting, driven by the ``set_env_train_without_gpu`` fixture.

    When the fixture value is exactly ``True`` the environment variable
    ``AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU`` is set to ``'1'`` and fitting with
    ``num_gpus=0`` must succeed. For every other value (including ``None``)
    the same fit call is expected to raise ``RuntimeError``.
    """
    train_url = ('https://autogluon-text.s3-accelerate.amazonaws.com/'
                 'glue/sst/train.parquet')
    dev_url = ('https://autogluon-text.s3-accelerate.amazonaws.com/'
               'glue/sst/dev.parquet')
    train_data = load_pd.load(train_url)
    dev_data = load_pd.load(dev_url)

    # Deterministic sub-sampling: the train permutation is drawn first, then dev.
    rng_state = np.random.RandomState(123)
    train_order = rng_state.permutation(len(train_data))
    dev_order = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_order[:100]]
    dev_data = dev_data.iloc[dev_order[:10]]

    predictor = TextPredictor(label='label', eval_metric='acc')

    if set_env_train_without_gpu is True:
        os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '1'
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0,
                      time_limit=30,
                      seed=123)
        verify_predictor_save_load(predictor, dev_data, verify_proba=True)
        return

    # ``None`` and any other non-True fixture value share one expectation.
    with pytest.raises(RuntimeError):
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0,
                      seed=123)
Пример #6
0
def test_sst(hyperparameters):
    """Fit on a 100-row SST sample with the given hyperparameters.

    After fitting, the predictor is evaluated on a 10-row dev sample and a
    save/load round trip (including probabilities) is verified.
    """
    train_url = ('https://autogluon-text-data.s3-accelerate.amazonaws.com/'
                 'glue/sst/train.parquet')
    dev_url = ('https://autogluon-text-data.s3-accelerate.amazonaws.com/'
               'glue/sst/dev.parquet')
    train_data = load_pd.load(train_url)
    dev_data = load_pd.load(dev_url)

    # Deterministic sub-sampling: the train permutation is drawn first, then dev.
    rng_state = np.random.RandomState(123)
    train_order = rng_state.permutation(len(train_data))
    dev_order = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_order[:100]]
    dev_data = dev_data.iloc[dev_order[:10]]

    predictor = TextPredictor(label='label', eval_metric='acc')
    predictor.fit(train_data, hyperparameters=hyperparameters)
    # The score itself is not asserted; the call only has to succeed.
    predictor.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor, dev_data, verify_proba=True)
Пример #7
0
def test_load_old_checkpoint():
    """Load a previously published checkpoint and continue training from it.

    The checkpoint archive is downloaded into ``<home>/checkpoints`` (any
    existing directory there is removed first), extracted, loaded, verified
    through a save/load round trip, then fit again for a short time and
    verified once more.
    """
    dataset = AmazonReviewSentimentCrossLingualDataset()
    checkpoint_name = "mdeberta-v3-base-checkpoint"
    sha1sum_id = "4ba096cdf6bd76c06386f2c27140db055e59c91b"

    save_path = os.path.join(get_home_dir(), "checkpoints")
    archive_path = os.path.join(save_path, f"{checkpoint_name}.zip")
    checkpoint_path = os.path.join(save_path, checkpoint_name)

    # Start from a clean directory so stale files cannot be picked up.
    if os.path.exists(save_path):
        shutil.rmtree(save_path)

    archive_url = f"s3://automl-mm-bench/unit-tests-0.4/checkpoints/{checkpoint_name}.zip"
    download(url=archive_url, path=archive_path, sha1_hash=sha1sum_id)
    protected_zip_extraction(archive_path, sha1_hash=sha1sum_id, folder=save_path)

    predictor = TextPredictor.load(checkpoint_path)
    verify_predictor_save_load(predictor, dataset.test_df)

    # Continue training the loaded predictor.
    continue_hparams = {"optimization.top_k_average_method": "uniform_soup"}
    predictor.fit(dataset.train_df,
                  presets="multilingual",
                  time_limit=10,
                  hyperparameters=continue_hparams)
    verify_predictor_save_load(predictor, dataset.test_df)
Пример #8
0
def train(args):
    """Train a predictor for one task and write predictions and the dev score.

    Reads ``args.seed``, ``args.task``, ``args.exp_dir``, ``args.train_file``,
    ``args.dev_file``, ``args.test_file`` and ``args.mode``. ``args.exp_dir``
    is filled in with a task-based default when it is ``None``.

    Outputs, all written into ``args.exp_dir``: ``dev_prediction.csv``,
    ``test_prediction.csv`` and ``final_model_scores.json``.

    Raises
    ------
    NotImplementedError
        If ``args.task`` is ``None`` or ``args.mode`` is not one of
        ``'stacking'``, ``'weighted'`` or ``'single'``.
    """
    set_seed(args.seed)
    if args.task is None:
        raise NotImplementedError
    feature_columns, label_column, eval_metric, _all_metrics = TASKS[args.task]
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)

    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]

    real_train_df = train_df
    if args.task in ('mrpc', 'sts'):
        # The two sentence columns form an unordered pair, so the training
        # set is augmented with a copy in which they are swapped.
        first_col, second_col = feature_columns[0], feature_columns[1]
        swapped_df = pd.DataFrame({first_col: train_df[second_col],
                                   second_col: train_df[first_col],
                                   label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, swapped_df])
    real_dev_df = dev_df

    if args.mode in ('stacking', 'weighted'):
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        fit_kwargs = dict(train_data=real_train_df,
                          tuning_data=real_dev_df,
                          hyperparameters='multimodal')
        if args.mode == 'stacking':
            # Stacking additionally enables bagging and one stack level.
            fit_kwargs.update(num_bag_folds=5, num_stack_levels=1)
        predictor.fit(**fit_kwargs)
    elif args.mode == 'single':
        # A single model is trained internally by TextPredictor.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      seed=args.seed)
    else:
        raise NotImplementedError

    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    score_path = os.path.join(args.exp_dir, 'final_model_scores.json')
    with open(score_path, 'w') as score_file:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, score_file)
Пример #9
0
def predict(args):
    """Load a saved predictor and write its test predictions to CSV.

    ``args.use_tabular`` selects ``TabularPredictor`` over ``TextPredictor``.
    The predictor is loaded from ``args.model_dir`` and applied to
    ``args.test_file``. The result is saved as ``test_prediction.csv`` in
    ``args.exp_dir``, which defaults to the current directory when ``None``.
    """
    predictor_cls = TabularPredictor if args.use_tabular else TextPredictor
    predictor = predictor_cls.load(args.model_dir)
    test_prediction = predictor.predict(args.test_file, as_pandas=True)
    if args.exp_dir is None:
        args.exp_dir = '.'
    output_path = os.path.join(args.exp_dir, 'test_prediction.csv')
    test_prediction.to_csv(output_path)
Пример #10
0
def test_emoji():
    """Fit a three-class predictor on emoji-only text and check its labels.

    Each class contributes strings made of one emoji repeated 1..N times
    (N = 150, 90 and 60 for grin, smile and wink respectively).
    """
    emoji_classes = (
        ('😁', 'grin', 50 * 3),
        ('😃', 'smile', 30 * 3),
        ('😉', 'wink', 20 * 3),
    )
    # Rows are generated class by class, with increasing repetition counts.
    data = [(emoji * repeat, name)
            for emoji, name, count in emoji_classes
            for repeat in range(1, count + 1)]
    df = pd.DataFrame(data, columns=['data', 'label'])

    predictor = TextPredictor(label='label', verbosity=3)
    predictor.fit(df,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    assert set(predictor.class_labels) == {'grin', 'smile', 'wink'}
    assert predictor.class_labels_internal == [0, 1, 2]
    verify_predictor_save_load(predictor, df)
Пример #11
0
def test_predictor_fit(key):
    """Fit, evaluate and round-trip a predictor for the dataset named ``key``.

    The dataset locations, label column, metric and whether probabilities
    should be verified are all read from ``DATA_INFO[key]``.
    """
    info = DATA_INFO[key]
    train_data = load_pd.load(info['train'])
    dev_data = load_pd.load(info['dev'])
    label = info['label']
    eval_metric = info['metric']
    verify_proba = info['verify_proba']

    # Deterministic sub-sampling: the train permutation is drawn first, then dev.
    rng_state = np.random.RandomState(123)
    train_order = rng_state.permutation(len(train_data))
    dev_order = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_order[:100]]
    dev_data = dev_data.iloc[dev_order[:10]]

    predictor = TextPredictor(label=label, eval_metric=eval_metric)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    # The score itself is not asserted; the call only has to succeed.
    predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)
Пример #12
0
 def load(cls, path: str, reset_paths=True, verbose=True):
     """Load the wrapper model and, if flagged, its nested ``TextPredictor``.

     The parent ``load`` restores the wrapper. When the restored object's
     ``_load_model`` flag is truthy, the text predictor stored in the
     ``cls.nn_model_name`` sub-directory of ``path`` is loaded and attached
     as ``model``. The flag is reset to ``None`` in every case.
     """
     restored = super().load(path=path,
                             reset_paths=reset_paths,
                             verbose=verbose)
     if restored._load_model:
         try_import_autogluon_text()
         from autogluon.text import TextPredictor
         nn_model_dir = os.path.join(path, cls.nn_model_name)
         restored.model = TextPredictor.load(nn_model_dir)
     restored._load_model = None
     return restored
Пример #13
0
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Load the wrapper model together with its nested ``TextPredictor``.

        Parameters
        ----------
        path
            Directory the model was saved to; the text predictor is read from
            its ``cls.nn_model_name`` sub-directory.
        reset_paths, verbose
            Forwarded unchanged to the parent ``load``.

        Raises
        ------
        ImportError
            If ``autogluon.text`` cannot be imported. The message is
            ``AG_TEXT_IMPORT_ERROR`` and the original failure is attached as
            the cause.
        """
        try:
            from autogluon.text import TextPredictor
        except ImportError as err:
            # Chain explicitly so the traceback marks the underlying import
            # failure as the direct cause of this friendlier error.
            raise ImportError(AG_TEXT_IMPORT_ERROR) from err

        model = super().load(path=path,
                             reset_paths=reset_paths,
                             verbose=verbose)
        model.model = TextPredictor.load(os.path.join(path, cls.nn_model_name))
        return model
Пример #14
0
def verify_predictor_save_load(predictor,
                               df,
                               verify_proba=False,
                               verify_embedding=True):
    """Assert that a predictor gives identical outputs after save and load.

    The predictor is saved to a temporary directory and re-loaded with
    ``TextPredictor.load``. Predictions on ``df`` from both objects must be
    equal. With ``verify_proba`` the same is required of ``predict_proba``.
    With ``verify_embedding`` the original predictor's ``predict_features``
    output must have one row per row of ``df``.
    """
    with tempfile.TemporaryDirectory() as save_dir:
        predictor.save(save_dir)
        original_pred = predictor.predict(df)
        restored = TextPredictor.load(save_dir)
        restored_pred = restored.predict(df)
        npt.assert_equal(original_pred, restored_pred)
        if verify_proba:
            original_prob = predictor.predict_proba(df)
            restored_prob = restored.predict_proba(df)
            npt.assert_equal(original_prob, restored_prob)
        if verify_embedding:
            features = predictor.predict_features(df)
            assert features.shape[0] == len(df)
Пример #15
0
def verify_predictor_save_load(predictor, df, verify_proba=False,
                               verify_embedding=True):
    """Assert that a predictor gives identical outputs after save and load.

    The predictor is saved to a temporary directory and re-loaded. Array
    predictions from the original and re-loaded predictor must match, and the
    re-loaded predictor's pandas output must match its array output. With
    ``verify_proba`` the same checks are applied to ``predict_proba``. With
    ``verify_embedding`` the original predictor's ``extract_embedding``
    output must have one row per row of ``df``.
    """
    with tempfile.TemporaryDirectory() as save_dir:
        predictor.save(save_dir)
        original_pred = predictor.predict(df, as_pandas=False)
        restored = TextPredictor.load(save_dir)
        restored_pred = restored.predict(df, as_pandas=False)
        restored_pred_df = restored.predict(df, as_pandas=True)
        npt.assert_equal(original_pred, restored_pred)
        npt.assert_equal(restored_pred, restored_pred_df.to_numpy())
        if verify_proba:
            original_prob = predictor.predict_proba(df, as_pandas=False)
            restored_prob = restored.predict_proba(df, as_pandas=False)
            restored_prob_df = restored.predict_proba(df, as_pandas=True)
            npt.assert_equal(original_prob, restored_prob)
            npt.assert_equal(restored_prob, restored_prob_df.to_numpy())
        if verify_embedding:
            features = predictor.extract_embedding(df)
            assert features.shape[0] == len(df)
Пример #16
0
def test_standalone_with_emoji():
    """Check that a standalone-saved predictor loads and predicts offline.

    A predictor is fit on emoji-only data and saved with ``standalone=True``.
    It is then loaded and used while ``requests.Session.request`` is patched
    to raise, so any HTTP attempt fails the test. Predictions made before
    saving and after the offline load must be equal.
    """
    import tempfile
    from unittest import mock

    blocked_request = mock.Mock(side_effect=RuntimeError(
        'Please use the `responses` library to mock HTTP in your tests.'
    ))
    requests_gag = mock.patch('requests.Session.request', blocked_request)

    emoji_classes = (
        ('😁', 'grin', 50 * 3),
        ('😃', 'smile', 30 * 3),
        ('😉', 'wink', 20 * 3),
    )
    # Rows are generated class by class, with increasing repetition counts.
    data = [(emoji * repeat, name)
            for emoji, name, count in emoji_classes
            for repeat in range(1, count + 1)]
    df = pd.DataFrame(data, columns=['data', 'label'])

    predictor = TextPredictor(label='label', verbosity=3)
    predictor.fit(
        df,
        hyperparameters=get_test_hyperparameters(),
        time_limit=5,
        seed=123,
    )

    online_pred = predictor.predict(df, as_pandas=False)
    with tempfile.TemporaryDirectory() as save_dir:
        predictor.save(save_dir, standalone=True)
        with requests_gag:  # no internet connections
            offline_predictor = TextPredictor.load(save_dir)
            offline_pred = offline_predictor.predict(df, as_pandas=False)

    npt.assert_equal(online_pred, offline_pred)
Пример #17
0
def main(args):
    """Write one tab-separated submission file per GLUE task into ``args.save_dir``.

    For every task except ``'ax'`` the predictions are read from
    ``{args.prefix}_{task}/test_prediction.csv`` and the row indices come from
    ``get_test_index``. For ``'ax'`` the predictor saved under
    ``{args.prefix}_mnli_m`` is loaded and run on the diagnostic TSV.
    Predictions for ``'sts'`` are clipped to the range [0, 5].

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test indices
        for a task.
    """
    tasks = {
        'cola': ['CoLA.tsv', 'glue/cola/test.tsv'],
        'sst': ['SST-2.tsv', 'glue/sst/test.tsv'],
        'mrpc': ['MRPC.tsv', 'glue/mrpc/test.tsv'],
        'sts': ['STS-B.tsv', 'glue/sts/test.tsv'],
        'qqp': ['QQP.tsv', 'glue/qqp/test.tsv'],
        'mnli_m': ['MNLI-m.tsv', 'glue/mnli/test_matched.tsv'],
        'mnli_mm': ['MNLI-mm.tsv', 'glue/mnli/test_mismatched.tsv'],
        'qnli': ['QNLI.tsv', 'glue/qnli/test.tsv'],
        'rte': ['RTE.tsv', 'glue/rte/test.tsv'],
        'wnli': ['WNLI.tsv', 'glue/wnli/test.tsv'],
        'ax': ['AX.tsv', 'glue/rte_diagnostic/diagnostic.tsv']
    }

    os.makedirs(args.save_dir, exist_ok=True)

    for task, (save_name, test_file_path) in tasks.items():
        if task == 'ax':
            # For AX, we need to load the mnli-m checkpoint and run inference
            test_df = pd.read_csv(test_file_path, sep='\t', header=0)
            test_index = test_df['index']
            predictor = TextPredictor.load(f'{args.prefix}_mnli_m')
            label_column = predictor.label
            predictions = predictor.predict(test_df)
        else:
            test_index = get_test_index(test_file_path)
            prediction_df = pd.read_csv(
                f'{args.prefix}_{task}/test_prediction.csv', index_col=0)
            label_column = prediction_df.columns[0]
            predictions = prediction_df[label_column]
        if task == 'sts':
            predictions = np.clip(predictions, 0, 5)
        if len(test_index) != len(predictions):
            raise ValueError(
                'Task {!r}: got {} predictions for {} test indices'.format(
                    task, len(predictions), len(test_index)))
        with open(os.path.join(args.save_dir, save_name), 'w') as of:
            of.write('index\t{}\n'.format(label_column))
            # Pair rows by position. ``predictions[i]`` on a pandas Series is
            # a label lookup, which breaks or misaligns when the saved index
            # is not exactly 0..n-1.
            for row_index, prediction in zip(test_index, predictions):
                of.write('{}\t{}\n'.format(row_index, prediction))
def extract_pretrained_embedding(dataset):
    """Extract text embeddings for a registered dataset and save them to disk.

    The ``'default'`` text preset is modified so that no layers are trainable,
    updates are disabled, the learning rate is zero and a single epoch is run.
    A predictor is then fit on 10 sampled training rows, and embeddings for
    the full train and test text columns are written to
    ``embeddings/{dataset}/pretrain_text_embedding`` as ``train.npy`` and
    ``test.npy``, together with ``text_columns.json``.
    """
    hyperparameters = ag_text_presets.create('default')
    search_space = hyperparameters['models']['MultimodalTextModel']['search_space']
    frozen_overrides = (
        ('model.num_trainable_layers', 0),
        ('model._disable_update', True),
        ('optimization.num_train_epochs', 1),
        ('preprocessing.categorical.convert_to_text', True),
        ('optimization.lr', 0.),
    )
    for option, value in frozen_overrides:
        search_space[option] = value

    seed = 123
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')

    # The split is used only to infer the column types below.
    train_part, tuning_part = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    column_types, _ = infer_column_problem_types(
        train_part,
        tuning_part,
        label_columns=train_dataset.label_columns,
        problem_type=train_dataset.problem_type)

    text_feature_columns = []
    for col_name in train_dataset.feature_columns:
        if column_types[col_name] == 'text':
            text_feature_columns.append(col_name)

    train_text_only_data = train_dataset.data[text_feature_columns + train_dataset.label_columns]
    test_text_only_data = test_dataset.data[text_feature_columns + test_dataset.label_columns]
    sampled_train_data = train_text_only_data.sample(10)

    predictor = TextPredictor(label=train_dataset.label_columns)
    predictor.fit(train_data=sampled_train_data,
                  column_types=column_types,
                  hyperparameters=hyperparameters)
    train_features = predictor.extract_embedding(train_text_only_data)
    test_features = predictor.extract_embedding(test_text_only_data)

    save_base_dir = f'embeddings/{dataset}/pretrain_text_embedding'
    os.makedirs(save_base_dir, exist_ok=True)
    np.save(os.path.join(save_base_dir, 'train.npy'), train_features)
    np.save(os.path.join(save_base_dir, 'test.npy'), test_features)
    columns_path = os.path.join(save_base_dir, 'text_columns.json')
    with open(columns_path, 'w') as out_f:
        json.dump(text_feature_columns, out_f)
Пример #19
0
def test_predictor_fit(key):
    """Fit, re-fit and re-load-then-fit a predictor for the dataset ``key``.

    Covers three scenarios in order: an initial fit followed by evaluation,
    a second fit on the same predictor object, and a fit on a predictor that
    was saved to disk and loaded back. A save/load round trip is verified
    after each of the first two fits.
    """
    info = DATA_INFO[key]
    train_data = load_pd.load(info['train'])
    dev_data = load_pd.load(info['dev'])
    label = info['label']
    eval_metric = info['metric']
    verify_proba = info['verify_proba']

    # Deterministic sub-sampling: the train permutation is drawn first, then dev.
    rng_state = np.random.RandomState(123)
    train_order = rng_state.permutation(len(train_data))
    dev_order = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_order[:100]]
    dev_data = dev_data.iloc[dev_order[:10]]

    def _fit(target):
        # Every fit in this test uses the same data and settings.
        target.fit(train_data,
                   hyperparameters=get_test_hyperparameters(),
                   time_limit=30,
                   seed=123)

    predictor = TextPredictor(label=label, eval_metric=eval_metric)
    _fit(predictor)
    # The score itself is not asserted; the call only has to succeed.
    predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)

    # Test for continuous fit
    _fit(predictor)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)

    # Saving to folder, loading the saved model and call fit again (continuous fit)
    with tempfile.TemporaryDirectory() as save_dir:
        predictor.save(save_dir)
        predictor = TextPredictor.load(save_dir)
        _fit(predictor)
Пример #20
0
def test_mixed_column_type():
    """Fit predictors on STS data extended with duplicated feature columns.

    Three predictors are trained on the widened frame: a regressor on
    ``score``, a classifier on ``genre``, and a regressor on ``score`` that
    only sees a subset of the columns. Each is evaluated on the dev sample
    and checked with a save/load round trip.
    """
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sts/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_order = rng_state.permutation(len(train_data))
    dev_order = rng_state.permutation(len(dev_data))

    def _with_extra_columns(frame):
        # Add more columns as feature: copies of existing columns under new names.
        return pd.DataFrame({
            'sentence1': frame['sentence1'],
            'sentence2': frame['sentence2'],
            'sentence3': frame['sentence2'],
            'categorical0': frame['genre'],
            'numerical0': frame['score'],
            'genre': frame['genre'],
            'score': frame['score']
        })

    train_data = _with_extra_columns(train_data.iloc[train_order[:1000]])
    dev_data = _with_extra_columns(dev_data.iloc[dev_order[:10]])

    def _fit(target, frame):
        # All three predictors share the same fit settings.
        target.fit(frame,
                   hyperparameters=get_test_hyperparameters(),
                   time_limit=30,
                   seed=123)

    # Train Regression
    regressor = TextPredictor(label='score', verbosity=4)
    _fit(regressor, train_data)
    regressor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(regressor, dev_data)

    # Train Classification
    classifier = TextPredictor(label='genre', verbosity=4)
    _fit(classifier, train_data)
    classifier.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(classifier, dev_data, verify_proba=True)

    # Specify the feature column
    subset_regressor = TextPredictor(label='score', verbosity=4)
    _fit(subset_regressor,
         train_data[['sentence1', 'sentence3', 'categorical0', 'score']])
    subset_regressor.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(subset_regressor, dev_data)
def train_model(dataset_name,
                text_presets,
                save_dir,
                model,
                tabular_presets,
                num_gpus=None,
                get_competition_results=False,
                seed=123):
    """Train one model variant on a registered dataset and write its outputs.

    The variant is chosen by ``model``. After training, the predictor is run
    on the evaluation split and the results are written into ``save_dir``:
    ``test_prediction.csv``, ``ground_truth.csv``, ``speed_stats.json``,
    ``test_prediction_prob.csv`` (binary / multiclass problems only) and
    ``test_score.json`` (only when ``get_competition_results`` is false).

    Parameters
    ----------
    dataset_name
        Name passed to ``dataset_registry.create``; also used as a
        sub-directory name when loading pre-computed embeddings.
    text_presets
        Preset name passed to ``ag_text_presets.create`` or to the multimodal
        tabular hyperparameter helpers.
    save_dir
        Predictor ``path`` and the directory all output files are written to.
    model
        One of ``'ag_tabular_quick'``, ``'ag_tabular_without_text'``,
        ``'ag_tabular_old'``, ``'ag_text_only'``, ``'ag_text_multimodal'``,
        ``'pre_embedding'``, ``'tune_embedding_multimodal'``,
        ``'tune_embedding_text'``, ``'tabular_multimodal'`` or
        ``'tabular_multimodal_just_table'``.
    tabular_presets
        ``'best_quality'``, ``'5fold_1stack'`` or ``'no'``; ``'3fold_1stack'``
        is additionally accepted by the two ``tabular_multimodal*`` variants.
        Not read by ``'ag_tabular_quick'``, ``'ag_text_only'`` and
        ``'ag_text_multimodal'``.
    num_gpus
        Forwarded to ``TextPredictor.fit`` in the two ``ag_text_*`` variants.
    get_competition_results
        When true, the ``'competition'`` split is used instead of ``'test'``
        and no score file is written.
    seed
        Passed to ``set_seed``, used for the column-type-inference split and
        forwarded to ``TextPredictor.fit``.

    Raises
    ------
    NotImplementedError
        If ``model`` or (where it is read) ``tabular_presets`` is not one of
        the supported values.
    """
    set_seed(seed)
    # Both branches load the same 'train' split; only the evaluation split differs.
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    # This 95/5 split is only used to infer column types below; the models
    # themselves are fit on the full training frame.
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1,
        tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        # Tabular predictor with a small n-gram vectorizer and a 30 s budget.
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data,
                      time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        # Drop every column whose inferred type is text, then fit a tabular predictor.
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        # NOTE(review): label columns are selected from test_data here even
        # when get_competition_results is true -- confirm the competition
        # split carries them.
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        # Tabular predictor on all columns with the default feature generator.
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        # Keep only the columns inferred as text and fit a TextPredictor.
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        # NOTE(review): as above, this selects label columns from test_data
        # regardless of get_competition_results -- confirm.
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        # Training sets above 500000 rows get the hyperparameters returned by set_epoch3.
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        # TextPredictor on all feature columns.
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'pre_embedding' or model == 'tune_embedding_multimodal' or model == 'tune_embedding_text':
        # Tabular predictor on the original columns plus pre-computed
        # embedding features loaded from .npy files; the built-in text
        # special / n-gram features are switched off.
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR,
                                            'pre_computed_embeddings')
        # The three variants differ only in the embedding sub-directory they read.
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        # NOTE(review): ``join`` aligns on the index, and the index is only
        # reset afterwards. This presumably relies on train_data / test_data
        # already having a default 0..n-1 index -- confirm.
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(train_features.shape[1])
                         ]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(test_features.shape[1])
                         ]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError

    elif model == 'tabular_multimodal' or model == 'tabular_multimodal_just_table':
        # Tabular predictor with raw text features enabled. The two variants
        # differ in the feature-generator flags and in which helper builds
        # the hyperparameters.
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    # NOTE(review): save() sits inside the timed region, so the reported
    # 'inference_time' also includes the time spent saving the predictor.
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    # NOTE(review): the label column is read from test_data unconditionally,
    # including when get_competition_results is true -- confirm the
    # competition split contains it.
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
Пример #22
0
class TextPredictionV1Model(AbstractModel):
    """AbstractModel wrapper that trains an ``autogluon.text.TextPredictor``.

    The wrapped predictor lives in ``self.model``. It is persisted separately
    from this object, inside the ``nn_model_name`` sub-directory of the model
    path, by :meth:`save` and restored from there by :meth:`load`.
    """
    # Sub-directory (below the model path) that holds the saved TextPredictor.
    nn_model_name = 'text_nn'

    def __init__(self, **kwargs):
        """The TextPredictionV1Model.

        The features can be a mix of
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of subdirectory inside path where model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        debug
            Whether to turn on debug mode
        """
        super().__init__(**kwargs)
        # Name of the label column appended to the feature frame; set in _fit.
        self._label_column_name = None

    def _get_default_auxiliary_params(self) -> dict:
        """Return the parent's auxiliary params extended with feature filters.

        Adds ``get_features_kwargs`` listing the raw feature types that are
        accepted and the special (text-derived) types that are rejected.
        """
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(get_features_kwargs=dict(
            valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
            invalid_special_types=[
                S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL
            ],
        ), )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        """Return the parent's default ag_args with ``valid_stacker`` set to False."""
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        """Set ``self.params`` to the 'default' preset of ``ag_text_presets``.

        Raises
        ------
        ImportError
            If ``autogluon.text`` cannot be imported.
        """
        try:
            from autogluon.text import ag_text_presets
        except ImportError as e:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(AG_TEXT_IMPORT_ERROR) from e
        super()._set_default_params()
        self.params = ag_text_presets.create('default')

    def _fit(self,
             X_train: pd.DataFrame,
             y_train: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             **kwargs):
        """The internal fit function

        Parameters
        ----------
        X_train
            Features of the training dataset
        y_train
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        kwargs
            Other keyword arguments. ``verbosity`` (default 2), ``num_cpus``
            and ``num_gpus`` (default None) are read from here.

        Raises
        ------
        ImportError
            If ``mxnet`` or ``autogluon.text`` cannot be imported.
        """
        try:
            # mxnet is imported here only so that a missing installation is
            # reported through the ImportError below.
            import mxnet as mx
            from autogluon.text import TextPredictor
        except ImportError as e:
            raise ImportError(AG_TEXT_IMPORT_ERROR) from e

        # Decide name of the label column: use 'label' unless a feature column
        # already has that name, otherwise the first free 'label<N>'.
        if 'label' in X_train.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X_train.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)

        self.model = TextPredictor(label=self._label_column_name,
                                   problem_type=self.problem_type,
                                   path=self.path,
                                   eval_metric=self.eval_metric,
                                   verbosity=verbosity)
        # TextPredictor takes features and label in one frame, so append the
        # label as the last column.
        X_train.insert(len(X_train.columns), self._label_column_name, y_train)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)
        assert self.params['hpo_params']['num_trials'] == 1 \
               or self.params['hpo_params']['num_trials'] is None
        # Work on a copy so the stored hyperparameters are left untouched.
        params = copy.deepcopy(self.params)
        # Halve the per-device batch size, but never go below 1.
        params['models']['MultimodalTextModel']['search_space']['optimization.per_device_batch_size']\
            = max(1,
                  params['models']['MultimodalTextModel']['search_space']['optimization.per_device_batch_size'] // 2)
        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       num_gpus=num_gpus,
                       num_cpus=num_cpus,
                       hyperparameters=params,
                       seed=params.get('seed'))

    def save(self, path: str = None, verbose=True) -> str:
        """Save this wrapper and, separately, the wrapped TextPredictor.

        Parameters
        ----------
        path
            Passed through to the parent ``save``.
        verbose
            Passed through to the parent ``save``.

        Returns
        -------
        path
            The path returned by the parent ``save``.
        """
        model = self.model
        self.model = None
        try:
            # save this AbstractModel object without NN weights
            path = super().save(path=path, verbose=verbose)
        finally:
            # Re-attach the predictor even if the parent save raised, so the
            # in-memory object is never left without its model.
            self.model = model

        text_nn_path = os.path.join(path, self.nn_model_name)
        model.save(text_nn_path)
        logger.log(
            15,
            f"\tSaved Text NN weights and model hyperparameters to '{text_nn_path}'."
        )

        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Load the wrapper from ``path`` and re-attach the saved TextPredictor.

        Raises
        ------
        ImportError
            If ``autogluon.text`` cannot be imported.
        """
        try:
            from autogluon.text import TextPredictor
        except ImportError as e:
            raise ImportError(AG_TEXT_IMPORT_ERROR) from e

        model = super().load(path=path,
                             reset_paths=reset_paths,
                             verbose=verbose)
        model.model = TextPredictor.load(os.path.join(path, cls.nn_model_name))
        return model

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        Returns
        -------
        memory_size
            The total memory size in bytes.
        """
        total_size = 0
        for _, v in self.model._model.net.collect_params().items():
            total_size += np.dtype(v.dtype).itemsize * np.prod(v.shape)
        # np.prod yields a NumPy scalar; return a plain int as annotated.
        return int(total_size)

    def _get_default_resources(self):
        """Return ``(num_cpus, num_gpus)`` from ``get_cpu_count`` / ``get_gpu_count``."""
        num_cpus = get_cpu_count()
        num_gpus = get_gpu_count()
        return num_cpus, num_gpus
Пример #23
0
    def _fit(self,
             X_train: pd.DataFrame,
             y_train: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             **kwargs):
        """Fit the wrapped ``TextPredictor`` on the given data.

        Parameters
        ----------
        X_train
            Training features.
        y_train
            Training labels.
        X_val
            Optional validation features.
        y_val
            Optional validation labels.
        time_limit
            Time budget forwarded to ``TextPredictor.fit``.
        kwargs
            Extra options; ``verbosity`` (default 2), ``num_cpus`` and
            ``num_gpus`` (default None) are read from here.

        """
        try:
            import mxnet as mx
            from autogluon.text import TextPredictor
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        # Choose a label column name that no feature column already uses:
        # plain 'label' when free, otherwise the first unused 'label<N>'.
        chosen_name = 'label'
        if chosen_name in X_train.columns:
            suffix = 0
            chosen_name = 'label{}'.format(suffix)
            while chosen_name in X_train.columns:
                suffix += 1
                chosen_name = 'label{}'.format(suffix)
        self._label_column_name = chosen_name

        X_train = self.preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        # Options supplied by the caller through kwargs
        num_gpus = kwargs.get('num_gpus', None)
        num_cpus = kwargs.get('num_cpus', None)
        verbosity = kwargs.get('verbosity', 2)

        predictor_options = dict(label=self._label_column_name,
                                 problem_type=self.problem_type,
                                 path=self.path,
                                 eval_metric=self.eval_metric,
                                 verbosity=verbosity)
        self.model = TextPredictor(**predictor_options)

        # The label is appended as the last column of each frame.
        X_train.insert(len(X_train.columns), self._label_column_name, y_train)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        hpo_params = self.params['hpo_params']
        assert hpo_params['num_trials'] == 1 or hpo_params['num_trials'] is None

        # Adjust a private copy of the hyperparameters: the per-device batch
        # size is halved, with a floor of 1.
        params = copy.deepcopy(self.params)
        search_space = params['models']['MultimodalTextModel']['search_space']
        batch_size_key = 'optimization.per_device_batch_size'
        search_space[batch_size_key] = max(1, search_space[batch_size_key] // 2)

        fit_options = dict(train_data=X_train,
                           tuning_data=X_val,
                           time_limit=time_limit,
                           num_gpus=num_gpus,
                           num_cpus=num_cpus,
                           hyperparameters=params,
                           seed=params.get('seed'))
        self.model.fit(**fit_options)
Пример #24
0
class TextPredictorModel(AbstractModel):
    """AbstractModel wrapper around ``autogluon.text.TextPredictor``.

    The wrapped predictor lives in ``self.model``. It is persisted separately
    from this object, inside the ``nn_model_name`` sub-directory of the model
    path, by :meth:`save` and restored from there by :meth:`load`.
    """
    # Sub-directory (below the model path) that holds the saved TextPredictor.
    nn_model_name = 'text_nn'

    def __init__(self, **kwargs):
        """Wrapper of autogluon.text.TextPredictor.

        The features can be a mix of
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of subdirectory inside path where model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        """
        super().__init__(**kwargs)
        # Name of the label column appended to the feature frame; set in _fit.
        self._label_column_name = None
        self._load_model = None  # Whether to load inner model when loading.

    def _get_default_auxiliary_params(self) -> dict:
        """Return the parent's auxiliary params extended with feature filters.

        Adds ``get_features_kwargs`` listing the raw feature types that are
        accepted and the special (text-derived) types that are rejected.
        """
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(get_features_kwargs=dict(
            valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
            invalid_special_types=[
                S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL
            ],
        ), )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        """Return the parent's default ag_args with ``valid_stacker`` set to False."""
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        """Set ``self.params`` to the 'default' preset of ``ag_text_presets``."""
        super()._set_default_params()
        try_import_autogluon_text()
        from autogluon.text import ag_text_presets
        self.params = ag_text_presets.create('default')

    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        sample_weight
            Not supported yet: when given, a message is logged and the
            weights are ignored.
        kwargs
            Other keyword arguments. ``verbosity`` (default 2), ``num_cpus``
            and ``num_gpus`` (default None) are read from here.

        """
        try_import_mxnet()
        try_import_autogluon_text()
        from autogluon.text import TextPredictor

        # Decide name of the label column: use 'label' unless a feature column
        # already has that name, otherwise the first free 'label<N>'.
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for TextPredictorModel, this model will ignore them in training."
            )

        # TextPredictor takes features and label in one frame, so append the
        # label as the last column.
        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)
        num_trials = self.params['tune_kwargs']['num_trials']
        assert num_trials == 1 or num_trials is None,\
            'Currently, you cannot nest the hyperparameter search in text neural network ' \
            'and the AutoGluon Tabular.'

        # The inner predictor is fitted one verbosity level lower (floor 0).
        verbosity_text = max(0, verbosity - 1)
        # Remember the root logger level so it can be put back afterwards.
        root_logger = logging.getLogger()
        root_log_level = root_logger.level
        try:
            self.model = TextPredictor(label=self._label_column_name,
                                       problem_type=self.problem_type,
                                       path=self.path,
                                       eval_metric=self.eval_metric,
                                       verbosity=verbosity_text)
            self.model.fit(train_data=X_train,
                           tuning_data=X_val,
                           time_limit=time_limit,
                           num_gpus=num_gpus,
                           num_cpus=num_cpus,
                           hyperparameters=self.params,
                           seed=self.params.get('seed', 0))
            self.model.set_verbosity(verbosity)
        finally:
            # Reset log level, also when fitting fails part-way.
            root_logger.setLevel(root_log_level)

    def save(self, path: str = None, verbose=True) -> str:
        """Save this wrapper and, if present, the wrapped TextPredictor.

        ``_load_model`` is set while the wrapper is being saved so that
        :meth:`load` knows whether a TextPredictor was stored next to it.

        Parameters
        ----------
        path
            Passed through to the parent ``save``.
        verbose
            Passed through to the parent ``save``.

        Returns
        -------
        path
            The path returned by the parent ``save``.
        """
        model = self.model
        has_model = model is not None
        self._load_model = has_model
        self.model = None
        try:
            # save this AbstractModel object without NN weights
            path = super().save(path=path, verbose=verbose)
        finally:
            # Restore the in-memory state even if the parent save raised.
            self.model = model
            self._load_model = None

        if has_model:
            text_nn_path = os.path.join(path, self.nn_model_name)
            model.save(text_nn_path)
            logger.log(
                15,
                f"\tSaved Text NN weights and model hyperparameters to '{text_nn_path}'."
            )
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Load the wrapper from ``path`` and re-attach the TextPredictor.

        The TextPredictor is only loaded when the saved wrapper recorded
        (via ``_load_model``) that one had been stored.
        """
        model = super().load(path=path,
                             reset_paths=reset_paths,
                             verbose=verbose)
        if model._load_model:
            try_import_autogluon_text()
            from autogluon.text import TextPredictor
            model.model = TextPredictor.load(
                os.path.join(path, cls.nn_model_name))
        model._load_model = None
        return model

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        Returns
        -------
        memory_size
            The total memory size in bytes.
        """
        total_size = 0
        for _, v in self.model._model.net.collect_params().items():
            total_size += np.dtype(v.dtype).itemsize * np.prod(v.shape)
        # np.prod yields a NumPy scalar; return a plain int as annotated.
        return int(total_size)

    def _get_default_resources(self):
        """Return ``(num_cpus, num_gpus)`` from ``get_cpu_count`` / ``get_gpu_count``."""
        num_cpus = get_cpu_count()
        num_gpus = get_gpu_count()
        return num_cpus, num_gpus

    def _predict_proba(self, X, **kwargs):
        """Predict with the wrapped TextPredictor.

        For regression the raw predictions are returned; otherwise the class
        probabilities are passed through ``_convert_proba_to_unified_form``.
        """
        X = self.preprocess(X, **kwargs)

        if self.problem_type == REGRESSION:
            return self.model.predict(X, as_pandas=False)

        y_pred_proba = self.model.predict_proba(X, as_pandas=False)
        return self._convert_proba_to_unified_form(y_pred_proba)
Пример #25
0
    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        sample_weight
            Not supported yet: when given, a message is logged and the
            weights are ignored.
        kwargs
            Other keyword arguments. ``verbosity`` (default 2), ``num_cpus``
            and ``num_gpus`` (default None) are read from here.

        """
        try_import_mxnet()
        try_import_autogluon_text()
        from autogluon.text import TextPredictor

        # Decide name of the label column: use 'label' unless a feature column
        # already has that name, otherwise the first free 'label<N>'.
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for TextPredictorModel, this model will ignore them in training."
            )

        # TextPredictor takes features and label in one frame, so append the
        # label as the last column.
        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)
        num_trials = self.params['tune_kwargs']['num_trials']
        assert num_trials == 1 or num_trials is None,\
            'Currently, you cannot nest the hyperparameter search in text neural network ' \
            'and the AutoGluon Tabular.'

        # The inner predictor is fitted one verbosity level lower (floor 0).
        verbosity_text = max(0, verbosity - 1)
        # Remember the root logger level so it can be put back afterwards.
        root_logger = logging.getLogger()
        root_log_level = root_logger.level
        try:
            self.model = TextPredictor(label=self._label_column_name,
                                       problem_type=self.problem_type,
                                       path=self.path,
                                       eval_metric=self.eval_metric,
                                       verbosity=verbosity_text)
            self.model.fit(train_data=X_train,
                           tuning_data=X_val,
                           time_limit=time_limit,
                           num_gpus=num_gpus,
                           num_cpus=num_cpus,
                           hyperparameters=self.params,
                           seed=self.params.get('seed', 0))
            self.model.set_verbosity(verbosity)
        finally:
            # Reset log level, also when fitting fails part-way.
            root_logger.setLevel(root_log_level)
Пример #26
0
def run(args):
    """Train a predictor for one competition task and write its submission.

    Reads ``task``, ``train_file``, ``test_file``, ``mode``, ``preset``,
    ``eval_metric``, ``exp_dir``, ``seed`` and ``sample_submission`` from
    ``args``. The submission file is written into ``args.exp_dir``.

    Raises
    ------
    NotImplementedError
        If ``args.task`` or ``args.mode`` is not one of the handled values.
    """
    task = args.task

    # Select the loader for the task; every loader takes (train_file, test_file).
    if task == 'product_sentiment':
        load_task_data = load_machine_hack_product_sentiment
    elif task == 'mercari_price':
        load_task_data = load_mercari_price_prediction
    elif task == 'price_of_books':
        load_task_data = load_price_of_books
    elif task == 'data_scientist_salary':
        load_task_data = load_data_scientist_salary
    else:
        raise NotImplementedError
    train_df, test_df, label_column = load_task_data(args.train_file,
                                                     args.test_file)

    hyperparameters = get_hyperparameter_config('multimodal')
    mode = args.mode
    uses_tabular = mode in ['stacking', 'weighted']
    if uses_tabular and args.preset is not None:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset

    if uses_tabular:
        # 'stacking' adds bagging and one stack level; 'weighted' uses the
        # plain fit call.
        extra_fit_kwargs = {}
        if mode == 'stacking':
            extra_fit_kwargs = dict(num_bag_folds=5, num_stack_levels=1)
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters,
                      **extra_fit_kwargs)
    elif mode == 'single':
        # When no embedding is used,
        # we will just use TextPredictor that will train a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=args.eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      presets=args.preset,
                      seed=args.seed)
    else:
        raise NotImplementedError

    # Write the submission in the format used for each task.
    if task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df, as_pandas=True, as_multiclass=True)
        csv_path = os.path.join(args.exp_dir, 'submission.csv')
        test_probabilities.to_csv(csv_path, index=False)
    elif task in ('data_scientist_salary', 'price_of_books'):
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        if task == 'price_of_books':
            # NOTE(review): presumably undoes a log10(1 + price) label
            # transform applied by the loader -- confirm.
            predictions = np.power(10, predictions) - 1
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        # NOTE(review): presumably undoes a log(1 + price) label transform
        # applied by the loader -- confirm.
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        csv_path = os.path.join(args.exp_dir, 'submission.csv')
        submission.to_csv(csv_path, index=False)
    else:
        raise NotImplementedError
    predictor = TabularPredictor(path=os.path.join(args.save_dir,
                                                   args.model_type, time_str),
                                 problem_type=train_dataset.problem_type,
                                 eval_metric=train_dataset.metric,
                                 label=label_columns[0])
    if args.ensemble_type == 'weighted':
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      hyperparameters=tabular_hparams)
    else:
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      num_bag_folds=5,
                      num_stack_levels=1,
                      hyperparameters=tabular_hparams)
    predictor.save()
else:
    predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type,
                                                time_str),
                              problem_type=train_dataset.problem_type,
                              eval_metric=train_dataset.metric,
                              label=label_columns[0])
    predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                  presets='electra_base_late_fusion_concate_e10_avg3')
    predictor.save(
        os.path.join(args.save_dir, args.model_type, time_str,
                     'text_prediction'))
predictions = predictor.predict(competition_df, as_pandas=True)
predictions.to_csv(
    os.path.join(args.save_dir, args.model_type, time_str, 'pred.csv'))
Пример #28
0
def test_distillation():
    """Fit a teacher TextPredictor, then distil it into a student twice.

    The student is trained once with the teacher passed as a predictor object
    and once with the teacher passed as its path; each student is checked
    with ``verify_predictor_save_load``.
    """

    def _fresh_dir(*parts):
        # Build the path and delete any directory left over from an earlier run.
        target = os.path.join(*parts)
        if os.path.exists(target):
            shutil.rmtree(target)
        return target

    train_url = ("https://autogluon-text.s3-accelerate.amazonaws.com/"
                 "glue/sst/train.parquet")
    test_url = ("https://autogluon-text.s3-accelerate.amazonaws.com/"
                "glue/sst/dev.parquet")
    train_data = load_pd.load(train_url)
    test_data = load_pd.load(test_url)

    # Sub-sample 100 training rows and 10 test rows with a fixed seed.
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    test_perm = rng_state.permutation(len(test_data))
    train_data = train_data.iloc[train_perm[:100]]
    test_data = test_data.iloc[test_perm[:10]]

    teacher_predictor = TextPredictor(label="label", eval_metric="acc")

    hyperparameters = {
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }

    teacher_save_path = _fresh_dir("sst", "teacher")
    teacher_predictor = teacher_predictor.fit(
        train_data=train_data,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=teacher_save_path,
    )

    # First pass: distillation from the teacher object.
    # Second pass: distillation from the teacher predictor path.
    for teacher_given_as_path in (False, True):
        predictor = TextPredictor(label="label", eval_metric="acc")
        student_save_path = _fresh_dir("sst", "student")

        if teacher_given_as_path:
            teacher = teacher_predictor.path
        else:
            teacher = teacher_predictor

        predictor = predictor.fit(
            train_data=train_data,
            teacher_predictor=teacher,
            hyperparameters=hyperparameters,
            time_limit=30,
            save_path=student_save_path,
        )
        verify_predictor_save_load(predictor, test_data)