def test_no_text_column_raise():
    data = ([('😁😁😁😁😁😁', 'grin')] * 2000
            + [('😃😃😃😃😃😃😃😃', 'smile')] * 1000
            + [('😉😉😉', 'wink')] * 1000)
    df = pd.DataFrame(data, columns=['data', 'label'])
    with pytest.raises(AssertionError):
        predictor = TextPredictor(label='label', verbosity=4)
        predictor.fit(df, hyperparameters=get_test_hyperparameters(), seed=123)
def test_empty_text_item():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    train_data = train_data.iloc[train_perm[:100]]
    train_data.iat[0, 0] = None
    train_data.iat[10, 0] = None
    predictor = TextPredictor(label='score', verbosity=4)
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(), time_limit=30)
def test_no_job_finished_raise():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    with pytest.raises(RuntimeError):
        # Set a very small time limit so that no training job can finish,
        # which should raise a RuntimeError.
        predictor = TextPredictor(label='label')
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      time_limit=1,
                      num_gpus=1,
                      seed=123)
def test_cpu_only_warning():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    with pytest.warns(UserWarning):
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0,
                      seed=123)
def test_cpu_only_raise(set_env_train_without_gpu):
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    if set_env_train_without_gpu is None:
        with pytest.raises(RuntimeError):
            predictor.fit(train_data,
                          hyperparameters=get_test_hyperparameters(),
                          num_gpus=0,
                          seed=123)
    elif set_env_train_without_gpu is True:
        os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = '1'
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      num_gpus=0,
                      time_limit=30,
                      seed=123)
        verify_predictor_save_load(predictor, dev_data, verify_proba=True)
    else:
        with pytest.raises(RuntimeError):
            predictor.fit(train_data,
                          hyperparameters=get_test_hyperparameters(),
                          num_gpus=0,
                          seed=123)
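# The test above takes a parameterized `set_env_train_without_gpu` fixture that
# is not included in this snippet. A minimal sketch of how such a fixture could
# look, assuming it simply yields each of the three cases the test branches on:
import os

import pytest


@pytest.fixture(params=[None, True, False])
def set_env_train_without_gpu(request):
    old_value = os.environ.pop('AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU', None)
    yield request.param
    # Restore the environment so the variable does not leak across test cases.
    os.environ.pop('AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU', None)
    if old_value is not None:
        os.environ['AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU'] = old_value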
def test_sst(hyperparameters):
    train_data = load_pd.load(
        'https://autogluon-text-data.s3-accelerate.amazonaws.com/'
        'glue/sst/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text-data.s3-accelerate.amazonaws.com/'
        'glue/sst/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label='label', eval_metric='acc')
    predictor.fit(train_data, hyperparameters=hyperparameters)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor, dev_data, verify_proba=True)
def test_load_old_checkpoint():
    dataset = AmazonReviewSentimentCrossLingualDataset()
    sha1sum_id = "4ba096cdf6bd76c06386f2c27140db055e59c91b"
    checkpoint_name = "mdeberta-v3-base-checkpoint"
    save_path = os.path.join(get_home_dir(), "checkpoints")
    file_path = os.path.join(save_path, f"{checkpoint_name}.zip")
    checkpoint_path = os.path.join(get_home_dir(), "checkpoints", checkpoint_name)
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    download(
        url=f"s3://automl-mm-bench/unit-tests-0.4/checkpoints/{checkpoint_name}.zip",
        path=file_path,
        sha1_hash=sha1sum_id,
    )
    protected_zip_extraction(
        file_path,
        sha1_hash=sha1sum_id,
        folder=save_path,
    )
    predictor = TextPredictor.load(checkpoint_path)
    verify_predictor_save_load(predictor, dataset.test_df)
    # Continuous training from the loaded checkpoint.
    predictor.fit(
        dataset.train_df,
        presets="multilingual",
        time_limit=10,
        hyperparameters={"optimization.top_k_average_method": "uniform_soup"},
    )
    verify_predictor_save_load(predictor, dataset.test_df)
def train(args):
    set_seed(args.seed)
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # MRPC and STS take unordered sentence pairs, so augment the training
        # set by also adding each pair in swapped order.
        train_df_other_part = pd.DataFrame(
            {feature_columns[0]: train_df[feature_columns[1]],
             feature_columns[1]: train_df[feature_columns[0]],
             label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal',
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal')
    elif args.mode == 'single':
        # When no ensembling is used, we just use TextPredictor,
        # which trains a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
def predict(args):
    if args.use_tabular:
        predictor = TabularPredictor.load(args.model_dir)
    else:
        predictor = TextPredictor.load(args.model_dir)
    test_prediction = predictor.predict(args.test_file, as_pandas=True)
    if args.exp_dir is None:
        args.exp_dir = '.'
    test_prediction.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
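# Hypothetical argparse wiring that would supply the attributes train() and
# predict() above read from `args`; every flag here is inferred from those
# attribute accesses rather than taken from the original script.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='AutoGluon-Text GLUE runner')
    parser.add_argument('--task', type=str, default=None)
    parser.add_argument('--train_file', type=str, default=None)
    parser.add_argument('--dev_file', type=str, default=None)
    parser.add_argument('--test_file', type=str, default=None)
    parser.add_argument('--exp_dir', type=str, default=None)
    parser.add_argument('--model_dir', type=str, default=None)
    parser.add_argument('--mode', choices=['stacking', 'weighted', 'single'],
                        default='single')
    parser.add_argument('--use_tabular', action='store_true')
    parser.add_argument('--seed', type=int, default=123)
    return parser.parse_args()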
def test_emoji():
    data = []
    for i in range(50 * 3):
        data.append(('😁' * (i + 1), 'grin'))
    for i in range(30 * 3):
        data.append(('😃' * (i + 1), 'smile'))
    for i in range(20 * 3):
        data.append(('😉' * (i + 1), 'wink'))
    df = pd.DataFrame(data, columns=['data', 'label'])
    predictor = TextPredictor(label='label', verbosity=3)
    predictor.fit(df,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    assert set(predictor.class_labels) == {'grin', 'smile', 'wink'}
    assert predictor.class_labels_internal == [0, 1, 2]
    verify_predictor_save_load(predictor, df)
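# The tests in this section share a get_test_hyperparameters() helper whose
# body is not included here. A plausible minimal version, mirroring the
# lightweight settings used in test_distillation further below — treat the
# exact keys and values as placeholders, not the helper's real contents:
def get_test_hyperparameters():
    return {
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }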
def verify_predictor_save_load(predictor, df, verify_proba=False,
                               verify_embedding=True):
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictions = predictor.predict(df)
        loaded_predictor = TextPredictor.load(root)
        predictions2 = loaded_predictor.predict(df)
        npt.assert_equal(predictions, predictions2)
        if verify_proba:
            predictions_prob = predictor.predict_proba(df)
            predictions2_prob = loaded_predictor.predict_proba(df)
            npt.assert_equal(predictions_prob, predictions2_prob)
        if verify_embedding:
            embeddings = predictor.predict_features(df)
            assert embeddings.shape[0] == len(df)
def verify_predictor_save_load(predictor, df, verify_proba=False,
                               verify_embedding=True):
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictions = predictor.predict(df, as_pandas=False)
        loaded_predictor = TextPredictor.load(root)
        predictions2 = loaded_predictor.predict(df, as_pandas=False)
        predictions2_df = loaded_predictor.predict(df, as_pandas=True)
        npt.assert_equal(predictions, predictions2)
        npt.assert_equal(predictions2, predictions2_df.to_numpy())
        if verify_proba:
            predictions_prob = predictor.predict_proba(df, as_pandas=False)
            predictions2_prob = loaded_predictor.predict_proba(df, as_pandas=False)
            predictions2_prob_df = loaded_predictor.predict_proba(df, as_pandas=True)
            npt.assert_equal(predictions_prob, predictions2_prob)
            npt.assert_equal(predictions2_prob, predictions2_prob_df.to_numpy())
        if verify_embedding:
            embeddings = predictor.extract_embedding(df)
            assert embeddings.shape[0] == len(df)
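# Example of exercising the round-trip helper above in a test of its own;
# the toy DataFrame here is hypothetical:
def test_save_load_roundtrip_example():
    df = pd.DataFrame({'data': ['good movie', 'bad movie'] * 50,
                       'label': ['pos', 'neg'] * 50})
    predictor = TextPredictor(label='label')
    predictor.fit(df, hyperparameters=get_test_hyperparameters(), time_limit=30)
    verify_predictor_save_load(predictor, df, verify_proba=True)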
def test_standalone_with_emoji():
    import tempfile
    from unittest import mock
    requests_gag = mock.patch(
        'requests.Session.request',
        mock.Mock(side_effect=RuntimeError(
            'Please use the `responses` library to mock HTTP in your tests.'
        ))
    )
    data = []
    for i in range(50 * 3):
        data.append(('😁' * (i + 1), 'grin'))
    for i in range(30 * 3):
        data.append(('😃' * (i + 1), 'smile'))
    for i in range(20 * 3):
        data.append(('😉' * (i + 1), 'wink'))
    df = pd.DataFrame(data, columns=['data', 'label'])
    predictor = TextPredictor(label='label', verbosity=3)
    predictor.fit(
        df,
        hyperparameters=get_test_hyperparameters(),
        time_limit=5,
        seed=123,
    )
    predictions1 = predictor.predict(df, as_pandas=False)
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root, standalone=True)
        with requests_gag:
            # no internet connections
            offline_predictor = TextPredictor.load(root)
            predictions2 = offline_predictor.predict(df, as_pandas=False)
        npt.assert_equal(predictions1, predictions2)
def main(args):
    tasks = {
        'cola': ['CoLA.tsv', 'glue/cola/test.tsv'],
        'sst': ['SST-2.tsv', 'glue/sst/test.tsv'],
        'mrpc': ['MRPC.tsv', 'glue/mrpc/test.tsv'],
        'sts': ['STS-B.tsv', 'glue/sts/test.tsv'],
        'qqp': ['QQP.tsv', 'glue/qqp/test.tsv'],
        'mnli_m': ['MNLI-m.tsv', 'glue/mnli/test_matched.tsv'],
        'mnli_mm': ['MNLI-mm.tsv', 'glue/mnli/test_mismatched.tsv'],
        'qnli': ['QNLI.tsv', 'glue/qnli/test.tsv'],
        'rte': ['RTE.tsv', 'glue/rte/test.tsv'],
        'wnli': ['WNLI.tsv', 'glue/wnli/test.tsv'],
        'ax': ['AX.tsv', 'glue/rte_diagnostic/diagnostic.tsv'],
    }
    os.makedirs(args.save_dir, exist_ok=True)
    for task, (save_name, test_file_path) in tasks.items():
        if task == 'ax':
            # For AX, we need to load the MNLI-m checkpoint and run inference.
            test_df = pd.read_csv(test_file_path, sep='\t', header=0)
            test_index = test_df['index']
            predictor = TextPredictor.load(f'{args.prefix}_mnli_m')
            label_column = predictor.label
            predictions = predictor.predict(test_df)
        else:
            test_index = get_test_index(test_file_path)
            prediction_df = pd.read_csv(
                f'{args.prefix}_{task}/test_prediction.csv', index_col=0)
            label_column = prediction_df.columns[0]
            predictions = prediction_df[label_column]
        if task == 'sts':
            predictions = np.clip(predictions, 0, 5)
        with open(os.path.join(args.save_dir, save_name), 'w') as of:
            of.write('index\t{}\n'.format(label_column))
            for i in range(len(predictions)):
                of.write('{}\t{}\n'.format(test_index[i], predictions[i]))
def extract_pretrained_embedding(dataset):
    hyperparameters = ag_text_presets.create('default')
    search_space = hyperparameters['models']['MultimodalTextModel']['search_space']
    # Freeze the backbone and effectively skip training, so that the
    # extracted embeddings come from the pretrained (untuned) model.
    search_space['model.num_trainable_layers'] = 0
    search_space['model._disable_update'] = True
    search_space['optimization.num_train_epochs'] = 1
    search_space['preprocessing.categorical.convert_to_text'] = True
    search_space['optimization.lr'] = 0.
    seed = 123
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=train_dataset.label_columns,
        problem_type=train_dataset.problem_type)
    text_feature_columns = [
        col_name for col_name in train_dataset.feature_columns
        if column_types[col_name] == 'text'
    ]
    train_text_only_data = train_dataset.data[text_feature_columns
                                              + train_dataset.label_columns]
    test_text_only_data = test_dataset.data[text_feature_columns
                                            + test_dataset.label_columns]
    sampled_train_data = train_text_only_data.sample(10)
    predictor = TextPredictor(label=train_dataset.label_columns[0])
    predictor.fit(train_data=sampled_train_data,
                  column_types=column_types,
                  hyperparameters=hyperparameters)
    train_features = predictor.extract_embedding(train_text_only_data)
    test_features = predictor.extract_embedding(test_text_only_data)
    save_base_dir = f'embeddings/{dataset}/pretrain_text_embedding'
    os.makedirs(save_base_dir, exist_ok=True)
    np.save(os.path.join(save_base_dir, 'train.npy'), train_features)
    np.save(os.path.join(save_base_dir, 'test.npy'), test_features)
    with open(os.path.join(save_base_dir, 'text_columns.json'), 'w') as out_f:
        json.dump(text_feature_columns, out_f)
def test_predictor_fit(key):
    train_data = load_pd.load(DATA_INFO[key]['train'])
    dev_data = load_pd.load(DATA_INFO[key]['dev'])
    label = DATA_INFO[key]['label']
    eval_metric = DATA_INFO[key]['metric']
    verify_proba = DATA_INFO[key]['verify_proba']
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label=label, eval_metric=eval_metric)
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    dev_score = predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)
    # Test continuous fit.
    predictor.fit(train_data,
                  hyperparameters=get_test_hyperparameters(),
                  time_limit=30,
                  seed=123)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)
    # Save to a folder, load the saved model, and call fit again (continuous fit).
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictor = TextPredictor.load(root)
        predictor.fit(train_data,
                      hyperparameters=get_test_hyperparameters(),
                      time_limit=30,
                      seed=123)
def test_mixed_column_type():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sts/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/'
        'glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:1000]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    # Add more columns as features.
    train_data = pd.DataFrame({
        'sentence1': train_data['sentence1'],
        'sentence2': train_data['sentence2'],
        'sentence3': train_data['sentence2'],
        'categorical0': train_data['genre'],
        'numerical0': train_data['score'],
        'genre': train_data['genre'],
        'score': train_data['score'],
    })
    dev_data = pd.DataFrame({
        'sentence1': dev_data['sentence1'],
        'sentence2': dev_data['sentence2'],
        'sentence3': dev_data['sentence2'],
        'categorical0': dev_data['genre'],
        'numerical0': dev_data['score'],
        'genre': dev_data['genre'],
        'score': dev_data['score'],
    })
    # Train a regressor.
    predictor1 = TextPredictor(label='score', verbosity=4)
    predictor1.fit(train_data,
                   hyperparameters=get_test_hyperparameters(),
                   time_limit=30,
                   seed=123)
    dev_rmse = predictor1.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor1, dev_data)
    # Train a classifier.
    predictor2 = TextPredictor(label='genre', verbosity=4)
    predictor2.fit(train_data,
                   hyperparameters=get_test_hyperparameters(),
                   time_limit=30,
                   seed=123)
    dev_acc = predictor2.evaluate(dev_data, metrics=['acc'])
    verify_predictor_save_load(predictor2, dev_data, verify_proba=True)
    # Specify the feature columns explicitly.
    predictor3 = TextPredictor(label='score', verbosity=4)
    predictor3.fit(
        train_data[['sentence1', 'sentence3', 'categorical0', 'score']],
        hyperparameters=get_test_hyperparameters(),
        time_limit=30,
        seed=123)
    dev_rmse = predictor3.evaluate(dev_data, metrics=['rmse'])
    verify_predictor_save_load(predictor3, dev_data)
def train_model(dataset_name, text_presets, save_dir, model, tabular_presets,
                num_gpus=None, get_competition_results=False, seed=123):
    set_seed(seed)
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data,
                      time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal', 'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[f'pre_feat{i}'
                                  for i in range(train_features.shape[1])]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[f'pre_feat{i}'
                                  for i in range(test_features.shape[1])]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
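# A hypothetical invocation of train_model() above; the dataset name, preset
# strings, and output directory are placeholders, not values taken from the
# benchmark's registry:
if __name__ == '__main__':
    train_model(dataset_name='example_dataset',
                text_presets='default',
                save_dir='ag_benchmark_output',
                model='ag_text_multimodal',
                tabular_presets='no',
                num_gpus=1,
                seed=123)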
class TextPredictionV1Model(AbstractModel):
    nn_model_name = 'text_nn'

    def __init__(self, **kwargs):
        """The TextPredictionV1Model.

        The features can be a mix of
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of subdirectory inside path where model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model.
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        debug
            Whether to turn on debug mode.
        """
        super().__init__(**kwargs)
        self._label_column_name = None

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            get_features_kwargs=dict(
                valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
                invalid_special_types=[
                    S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL
                ],
            ),
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        try:
            from autogluon.text import ag_text_presets
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)
        super()._set_default_params()
        self.params = ag_text_presets.create('default')

    def _fit(self,
             X_train: pd.DataFrame,
             y_train: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             **kwargs):
        """The internal fit function.

        Parameters
        ----------
        X_train
            Features of the training dataset.
        y_train
            Labels of the training dataset.
        X_val
            Features of the validation dataset.
        y_val
            Labels of the validation dataset.
        time_limit
            The time limit for the fit function.
        kwargs
            Other keyword arguments.
        """
        try:
            import mxnet as mx
            from autogluon.text import TextPredictor
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)
        # Decide the name of the label column; avoid clashing with a feature
        # column that is already called 'label'.
        if 'label' in X_train.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X_train.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        self.model = TextPredictor(label=self._label_column_name,
                                   problem_type=self.problem_type,
                                   path=self.path,
                                   eval_metric=self.eval_metric,
                                   verbosity=verbosity)
        X_train.insert(len(X_train.columns), self._label_column_name, y_train)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)
        assert self.params['hpo_params']['num_trials'] == 1 \
            or self.params['hpo_params']['num_trials'] is None
        params = copy.deepcopy(self.params)
        search_space = params['models']['MultimodalTextModel']['search_space']
        # Halve the per-device batch size, but keep it at least 1.
        search_space['optimization.per_device_batch_size'] = max(
            1, search_space['optimization.per_device_batch_size'] // 2)
        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       num_gpus=num_gpus,
                       num_cpus=num_cpus,
                       hyperparameters=params,
                       seed=params.get('seed'))

    def save(self, path: str = None, verbose=True) -> str:
        model = self.model
        self.model = None
        # Save this AbstractModel object without NN weights.
        path = super().save(path=path, verbose=verbose)
        self.model = model
        text_nn_path = os.path.join(path, self.nn_model_name)
        model.save(text_nn_path)
        logger.log(
            15,
            f"\tSaved Text NN weights and model hyperparameters to '{text_nn_path}'.")
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        try:
            from autogluon.text import TextPredictor
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)
        model = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
        model.model = TextPredictor.load(os.path.join(path, cls.nn_model_name))
        return model

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        Returns
        -------
        memory_size
            The total memory size in bytes.
        """
        total_size = 0
        for k, v in self.model._model.net.collect_params().items():
            total_size += np.dtype(v.dtype).itemsize * np.prod(v.shape)
        return total_size

    def _get_default_resources(self):
        num_cpus = get_cpu_count()
        num_gpus = get_gpu_count()
        return num_cpus, num_gpus
class TextPredictorModel(AbstractModel):
    nn_model_name = 'text_nn'

    def __init__(self, **kwargs):
        """Wrapper of autogluon.text.TextPredictor.

        The features can be a mix of
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of subdirectory inside path where model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model.
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        """
        super().__init__(**kwargs)
        self._label_column_name = None
        self._load_model = None  # Whether to load the inner model when loading.

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            get_features_kwargs=dict(
                valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
                invalid_special_types=[
                    S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL
                ],
            ),
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        super()._set_default_params()
        try_import_autogluon_text()
        from autogluon.text import ag_text_presets
        self.params = ag_text_presets.create('default')

    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function.

        Parameters
        ----------
        X
            Features of the training dataset.
        y
            Labels of the training dataset.
        X_val
            Features of the validation dataset.
        y_val
            Labels of the validation dataset.
        time_limit
            The time limit for the fit function.
        sample_weight
            Sample weights (not yet supported; ignored with a log message).
        kwargs
            Other keyword arguments.
        """
        try_import_mxnet()
        try_import_autogluon_text()
        from autogluon.text import TextPredictor
        # Decide the name of the label column; avoid clashing with a feature
        # column that is already called 'label'.
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for TextPredictorModel, "
                "this model will ignore them in training.")
        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)
        assert self.params['tune_kwargs']['num_trials'] == 1 \
            or self.params['tune_kwargs']['num_trials'] is None, \
            'Currently, you cannot nest the hyperparameter search in text neural network ' \
            'and the AutoGluon Tabular.'
        verbosity_text = max(0, verbosity - 1)
        root_logger = logging.getLogger()
        root_log_level = root_logger.level
        self.model = TextPredictor(label=self._label_column_name,
                                   problem_type=self.problem_type,
                                   path=self.path,
                                   eval_metric=self.eval_metric,
                                   verbosity=verbosity_text)
        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       num_gpus=num_gpus,
                       num_cpus=num_cpus,
                       hyperparameters=self.params,
                       seed=self.params.get('seed', 0))
        self.model.set_verbosity(verbosity)
        root_logger.setLevel(root_log_level)  # Reset log level

    def save(self, path: str = None, verbose=True) -> str:
        self._load_model = self.model is not None
        __model = self.model
        self.model = None
        # Save this AbstractModel object without NN weights.
        path = super().save(path=path, verbose=verbose)
        self.model = __model
        if self._load_model:
            text_nn_path = os.path.join(path, self.nn_model_name)
            self.model.save(text_nn_path)
            logger.log(
                15,
                f"\tSaved Text NN weights and model hyperparameters to '{text_nn_path}'.")
        self._load_model = None
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        model = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
        if model._load_model:
            try_import_autogluon_text()
            from autogluon.text import TextPredictor
            model.model = TextPredictor.load(
                os.path.join(path, cls.nn_model_name))
        model._load_model = None
        return model

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        Returns
        -------
        memory_size
            The total memory size in bytes.
        """
        total_size = 0
        for k, v in self.model._model.net.collect_params().items():
            total_size += np.dtype(v.dtype).itemsize * np.prod(v.shape)
        return total_size

    def _get_default_resources(self):
        num_cpus = get_cpu_count()
        num_gpus = get_gpu_count()
        return num_cpus, num_gpus

    def _predict_proba(self, X, **kwargs):
        X = self.preprocess(X, **kwargs)
        if self.problem_type == REGRESSION:
            return self.model.predict(X, as_pandas=False)
        y_pred_proba = self.model.predict_proba(X, as_pandas=False)
        return self._convert_proba_to_unified_form(y_pred_proba)
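# A sketch of how the wrapper above could be plugged into AutoGluon Tabular.
# Passing a custom AbstractModel subclass as a hyperparameters key follows
# AutoGluon's custom-model convention, but whether the project registers
# TextPredictorModel this way is an assumption:
from autogluon.tabular import TabularPredictor


def fit_tabular_with_text_nn(train_df: pd.DataFrame, label: str):
    predictor = TabularPredictor(label=label)
    predictor.fit(train_df, hyperparameters={TextPredictorModel: {}})
    return predictor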
def run(args):
    if args.task == 'product_sentiment':
        train_df, test_df, label_column = load_machine_hack_product_sentiment(
            args.train_file, args.test_file)
    elif args.task == 'mercari_price':
        train_df, test_df, label_column = load_mercari_price_prediction(
            args.train_file, args.test_file)
    elif args.task == 'price_of_books':
        train_df, test_df, label_column = load_price_of_books(
            args.train_file, args.test_file)
    elif args.task == 'data_scientist_salary':
        train_df, test_df, label_column = load_data_scientist_salary(
            args.train_file, args.test_file)
    else:
        raise NotImplementedError
    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters,
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # When no ensembling is used, we just use TextPredictor,
        # which trains a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=args.eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      presets=args.preset,
                      seed=args.seed)
    else:
        raise NotImplementedError
    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df,
                                                     as_pandas=True,
                                                     as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'),
                                  index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        # Labels were trained in log10(1 + price) space; invert the transform.
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        # Labels were trained in log(1 + price) space; invert the transform.
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'),
                          index=False)
    else:
        raise NotImplementedError
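# The inverse transforms above (np.power(10, p) - 1 and np.exp(p) - 1) imply
# that the loaders train on log-scaled prices. A minimal sketch of the assumed
# forward transform; the loader internals are not shown in this snippet:
def log_scale_price(price_series, base10=True):
    # price_of_books: log10(1 + price); mercari_price: log(1 + price)
    return np.log10(1 + price_series) if base10 else np.log1p(price_series)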
# Fragment: the enclosing `if`/`else` chooses between a tabular ensemble and a
# text-only predictor; its opening condition is not included in this snippet.
    predictor = TabularPredictor(path=os.path.join(args.save_dir, args.model_type, time_str),
                                 problem_type=train_dataset.problem_type,
                                 eval_metric=train_dataset.metric,
                                 label=label_columns[0])
    if args.ensemble_type == 'weighted':
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      hyperparameters=tabular_hparams)
    else:
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      num_bag_folds=5,
                      num_stack_levels=1,
                      hyperparameters=tabular_hparams)
    predictor.save()
else:
    predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type, time_str),
                              problem_type=train_dataset.problem_type,
                              eval_metric=train_dataset.metric,
                              label=label_columns[0])
    predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                  presets='electra_base_late_fusion_concate_e10_avg3')
    predictor.save(
        os.path.join(args.save_dir, args.model_type, time_str, 'text_prediction'))
predictions = predictor.predict(competition_df, as_pandas=True)
predictions.to_csv(
    os.path.join(args.save_dir, args.model_type, time_str, 'pred.csv'))
def test_distillation():
    train_data = load_pd.load(
        "https://autogluon-text.s3-accelerate.amazonaws.com/"
        "glue/sst/train.parquet")
    test_data = load_pd.load(
        "https://autogluon-text.s3-accelerate.amazonaws.com/"
        "glue/sst/dev.parquet")
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    test_perm = rng_state.permutation(len(test_data))
    train_data = train_data.iloc[train_perm[:100]]
    test_data = test_data.iloc[test_perm[:10]]
    teacher_predictor = TextPredictor(label="label", eval_metric="acc")
    hyperparameters = {
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }
    teacher_save_path = os.path.join("sst", "teacher")
    if os.path.exists(teacher_save_path):
        shutil.rmtree(teacher_save_path)
    teacher_predictor = teacher_predictor.fit(
        train_data=train_data,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=teacher_save_path,
    )
    # Test distillation with a teacher predictor object.
    predictor = TextPredictor(label="label", eval_metric="acc")
    student_save_path = os.path.join("sst", "student")
    if os.path.exists(student_save_path):
        shutil.rmtree(student_save_path)
    predictor = predictor.fit(
        train_data=train_data,
        teacher_predictor=teacher_predictor,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=student_save_path,
    )
    verify_predictor_save_load(predictor, test_data)
    # Test distillation with a teacher predictor path.
    predictor = TextPredictor(label="label", eval_metric="acc")
    student_save_path = os.path.join("sst", "student")
    if os.path.exists(student_save_path):
        shutil.rmtree(student_save_path)
    predictor = predictor.fit(
        train_data=train_data,
        teacher_predictor=teacher_predictor.path,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=student_save_path,
    )
    verify_predictor_save_load(predictor, test_data)