Example #1
    def test_load_json(self, name):
        empty_json_fp = Path(self._json_data_dir(),
                             'empty_target_instance.json')
        empty_collection = TargetTextCollection.load_json(empty_json_fp,
                                                          name=name)
        assert TargetTextCollection() == empty_collection
        assert empty_collection.name == name

        # Ensure that it raises an error when loading a bad json file
        wrong_json_fp = Path(self._json_data_dir(),
                             'wrong_target_instance.json')
        with pytest.raises(ValueError):
            TargetTextCollection.load_json(wrong_json_fp, name=name)
        # Ensure that it can load a single target text instance correctly
        one_target_json_fp = Path(self._json_data_dir(),
                                  'one_target_instance.json')
        one_target_collection = TargetTextCollection.load_json(
            one_target_json_fp)
        assert len(one_target_collection) == 1
        assert one_target_collection['0'][
            'text'] == 'The laptop case was great and cover was rubbish'
        assert one_target_collection['0']['target_sentiments'] == [0]
        assert one_target_collection['0']['category_sentiments'] == ['pos']
        assert one_target_collection['0']['categories'] == ['LAPTOP#CASE']
        assert one_target_collection['0']['spans'] == [Span(4, 15)]
        assert one_target_collection['0']['targets'] == ['laptop case']

        # Ensure that it can load multiple target text instances
        two_target_json_fp = Path(self._json_data_dir(),
                                  'one_target_one_empty_instance.json')
        two_target_collection = TargetTextCollection.load_json(
            two_target_json_fp)
        assert len(two_target_collection) == 2
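
The asserts above double as a specification of the stored fields. As a
cross-check, here is a minimal sketch (not part of the test file) of building
the same single instance in code; it assumes TargetText accepts these fields
as keyword arguments, which the asserts suggest but the excerpt does not show:

from target_extraction.data_types import TargetText
from target_extraction.data_types_util import Span

# Hedged sketch: the instance that 'one_target_instance.json' should load to,
# mirroring the asserts in test_load_json above.
expected_instance = TargetText(
    text='The laptop case was great and cover was rubbish',
    text_id='0',
    targets=['laptop case'],
    spans=[Span(4, 15)],
    target_sentiments=[0],
    categories=['LAPTOP#CASE'],
    category_sentiments=['pos'])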
Example #2
    def test_to_json_file(self):
        test_collection = TargetTextCollection()
        with tempfile.NamedTemporaryFile(mode='w+') as temp_fp:
            temp_path = Path(temp_fp.name)
            test_collection.to_json_file(temp_path)
            assert len(TargetTextCollection.load_json(temp_path)) == 0

            # Ensure that it can load more than one TargetText example
            test_collection = TargetTextCollection(
                self._target_text_examples())
            test_collection.to_json_file(temp_path)
            assert len(TargetTextCollection.load_json(temp_path)) == 3

            # Ensure that saving to the same file overwrites that file
            test_collection = TargetTextCollection(
                self._target_text_examples())
            test_collection.to_json_file(temp_path)
            assert len(TargetTextCollection.load_json(temp_path)) == 3

            # Ensure that it can load just one example
            test_collection = TargetTextCollection(
                [self._target_text_example()])
            test_collection.to_json_file(temp_path)
            assert len(TargetTextCollection.load_json(temp_path)) == 1
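
The test above only checks collection lengths after each save. Since Example
#1 compares whole collections with ==, a natural extra round-trip check would
be the following sketch, assuming equality compares the stored instances:

# Hedged sketch: the save/load round trip should preserve the collection,
# assuming == compares stored TargetText instances as Example #1 suggests.
test_collection.to_json_file(temp_path)
assert test_collection == TargetTextCollection.load_json(temp_path)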
Example #3
def run_model(train_fp: Path,
              val_fp: Path,
              test_fp: Path,
              config_fp: Path,
              number_runs: int,
              prediction_key: str,
              save_dir: Optional[Path] = None,
              only_produce_model: bool = False) -> None:
    '''
    :param train_fp: Path to file that contains JSON formatted training data
    :param val_fp: Path to file that contains JSON formatted validation data
    :param test_fp: Path to file that contains JSON formatted testing data
    :param config_fp: Path to file that contains the models configuration
    :param number_runs: Number of times to run the model
    :param prediction_key: The key to save the predictions to within the 
                           validation and test data
    :param save_dir: Path to save the model to.
    :param only_produce_model: Whether or not to train the model once more
                               after all the predictions have been made, so
                               that a trained model is saved to `save_dir`
    '''
    # Test if all the predictions have already been made
    temp_test_data = TargetTextCollection.load_json(test_fp)
    temp_test_value = next(temp_test_data.dict_iterator())
    predictions_left = number_runs
    if prediction_key in temp_test_value:
        number_runs_done = len(temp_test_value[prediction_key])
        predictions_left = predictions_left - number_runs_done
        if number_runs_done >= number_runs and not only_produce_model:
            print('Predictions have already been made')
            return
    train_data = TargetTextCollection.load_json(train_fp)
    val_data = TargetTextCollection.load_json(val_fp)
    test_data = TargetTextCollection.load_json(test_fp)

    key_mappings = {'sentiments': prediction_key}

    if only_produce_model:
        model = AllenNLPModel('model',
                              config_fp,
                              save_dir=save_dir,
                              predictor_name='target-sentiment')
        model.fit(train_data, val_data, test_data)

    for run in range(predictions_left):
        print(f'Run number {run}')
        if run == 0 and predictions_left == number_runs:
            model = AllenNLPModel('model',
                                  config_fp,
                                  save_dir=save_dir,
                                  predictor_name='target-sentiment')
        else:
            model = AllenNLPModel('model',
                                  config_fp,
                                  predictor_name='target-sentiment')
        model.fit(train_data, val_data, test_data)
        model.predict_into_collection(val_data, key_mapping=key_mappings)
        model.predict_into_collection(test_data, key_mapping=key_mappings)
    val_data.to_json_file(val_fp, include_metadata=True)
    test_data.to_json_file(test_fp, include_metadata=True)
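
For context, a hypothetical invocation of run_model; every path below is
illustrative, and the prediction key is borrowed from Example #21 rather than
taken from this script:

# Hypothetical usage of run_model; all paths and values are illustrative.
run_model(train_fp=Path('data/train.json'),
          val_fp=Path('data/val.json'),
          test_fp=Path('data/test.json'),
          config_fp=Path('configs/model.jsonnet'),
          number_runs=5,
          prediction_key='predicted_target_sentiments',
          save_dir=Path('saved_models'),
          only_produce_model=False)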
Example #4
    def test_predict_into_collection(self, batch_size: Optional[int],
                                     append_if_exists: bool):
        # Load a pre-trained target extraction model
        model_dir = self.TARGET_EXTRACTION_MODEL
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              model_dir)
        model.load()
        # Test the normal case
        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        key_mappings = {'tags': 'predicted_tags', 'words': 'predicted_tokens'}
        train_data = model.predict_into_collection(train_data, key_mappings,
                                                   batch_size,
                                                   append_if_exists)
        for target_data in train_data.values():
            assert 'predicted_tags' in target_data
            assert 'tags' not in target_data
            assert 'predicted_tokens' in target_data
            assert 'tokens' not in target_data

            target_tokens = target_data['tokenized_text']
            assert len(target_tokens) == len(target_data['predicted_tags'][0])
            assert len(target_tokens) == len(
                target_data['predicted_tokens'][0])
            assert target_tokens == target_data['predicted_tokens'][0]
        # This should be fine when append_if_exists is True and raise a
        # KeyError otherwise.
        if append_if_exists:
            train_data = model.predict_into_collection(train_data,
                                                       key_mappings,
                                                       batch_size,
                                                       append_if_exists)
            for target_data in train_data.values():
                target_tokens = target_data['tokenized_text']
                assert 2 == len(target_data['predicted_tags'])
                assert target_data['predicted_tags'][0] == target_data[
                    'predicted_tags'][1]
                assert target_tokens == target_data['predicted_tokens'][0]
                assert target_tokens == target_data['predicted_tokens'][1]
        else:
            with pytest.raises(KeyError):
                train_data = model.predict_into_collection(
                    train_data, key_mappings, batch_size, append_if_exists)
        # Raise a KeyError when a `key_mappings` key is not within the
        # model's prediction output
        from collections import OrderedDict
        key_mappings = OrderedDict([('tags', 'predicted_tags'),
                                    ('wordss', 'predicted_tokens')])
        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        with pytest.raises(KeyError):
            train_data = model.predict_into_collection(train_data,
                                                       key_mappings,
                                                       batch_size,
                                                       append_if_exists)
        for target_data in train_data.values():
            assert 'predicted_tags' not in target_data
            assert 'predicted_tokens' not in target_data
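
To make the run indexing above concrete: each predict_into_collection call
appends one prediction list per TargetText, so after n appending runs the
stored value is a list of n per-token lists. A hedged sketch, where '0' is a
hypothetical text_id rather than one taken from the test data:

# Hedged sketch of the stored shape after two appending runs.
target_data = train_data['0']  # '0' is a hypothetical text_id
assert len(target_data['predicted_tags']) == 2  # one entry per run
for run_tags in target_data['predicted_tags']:
    # each run's tags align token for token with the tokenised text
    assert len(run_tags) == len(target_data['tokenized_text'])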
Example #5
def run_model(train_fp: Path,
              val_fp: Path,
              test_fp: Path,
              config_fp: Path,
              save_test_fp: Path,
              number_runs: int,
              model_save_dir: Optional[Path] = None,
              save_val_fp: Optional[Path] = None) -> None:
    '''
    :param train_fp: Path to file that contains JSON formatted training data
    :param val_fp: Path to file that contains JSON formatted validation data
    :param test_fp: Path to file that contains JSON formatted testing data
    :param config_fp: Path to file that contains the models configuration
    :param save_test_fp: Path to save the test data results
    :param number_runs: Number of times to run the model
    :param model_save_dir: Path to save the first trained model (optional)
    :param save_val_fp: Path to save the validation data results (optional)
    '''
    train_data = TargetTextCollection.load_json(train_fp)
    val_data = TargetTextCollection.load_json(val_fp)
    test_data = TargetTextCollection.load_json(test_fp)

    test_prediction_data = list(test_data.dict_iterator())
    if save_val_fp:
        val_prediction_data = list(val_data.dict_iterator())

    for run in range(number_runs):
        if run == 0 and model_save_dir:
            model = AllenNLPModel('model',
                                  config_fp,
                                  predictor_name='target-sentiment',
                                  save_dir=model_save_dir)
        else:
            model = AllenNLPModel('model',
                                  config_fp,
                                  predictor_name='target-sentiment')
        model.fit(train_data, val_data, test_data)
        predict_on(model, test_prediction_data, test_data)
        if save_val_fp:
            predict_on(model, val_prediction_data, val_data)
    test_data.to_json_file(save_test_fp)
    if save_val_fp:
        val_data.to_json_file(save_val_fp)
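
The predict_on helper used above is not shown in this excerpt. A plausible
reconstruction, modelled on the commented-out prediction loop in Example #18;
the 'predicted_target_sentiments' default and the 'sentiments' output field
are assumptions:

from typing import List

# Hedged reconstruction of the unshown predict_on helper; modelled on the
# commented-out loop in Example #18, not taken from the source.
def predict_on(model: AllenNLPModel, prediction_data: List[dict],
               collection: TargetTextCollection,
               prediction_key: str = 'predicted_target_sentiments') -> None:
    for prediction, target in model._predict_iter(prediction_data,
                                                  yield_original_target=True):
        text_id = target['text_id']
        if prediction_key not in collection[text_id]:
            collection[text_id][prediction_key] = []
        collection[text_id][prediction_key].append(prediction['sentiments'])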
Example #6
    def test_target_extraction_fit(self, test_data: bool):

        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
        assert model.model is None

        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        val_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)

        tokens_in_vocab = [
            'at', 'case', 'was', 'the', 'day', 'great', 'cover', 'office',
            'another', 'and', 'rubbish', 'laptop', '@@PADDING@@', '@@UNKNOWN@@'
        ]
        if test_data:
            tokens_in_vocab = tokens_in_vocab + ['better']
            test_data = TargetTextCollection.load_json(
                self.TARGET_EXTRACTION_TEST_DATA)
            model.fit(train_data, val_data, test_data)
        else:
            model.fit(train_data, val_data)

        token_index = model.model.vocab.get_token_to_index_vocabulary('tokens')
        assert len(token_index) == len(tokens_in_vocab)
        for token in tokens_in_vocab:
            assert token in token_index

        # Check attributes have changed.
        assert model.model is not None
        assert isinstance(model.model, Model)

        # Check that it will save to a directory of our choosing
        with tempfile.TemporaryDirectory() as save_dir:
            saved_model_fp = Path(save_dir, 'model.tar.gz')
            assert not saved_model_fp.exists()
            model = AllenNLPModel('TE',
                                  self.CONFIG_FILE,
                                  'target-tagger',
                                  save_dir=save_dir)
            model.fit(train_data, val_data)
            assert saved_model_fp.exists()
Example #7
    prediction_key_rename = {
        'predicted_target_sentiment_average_GloVe':
        'predicted_target_sentiment_CNN_GloVe_None_None',
        'predicted_target_sentiment_average_CWR':
        'predicted_target_sentiment_CNN_CWR_None_None'
    }
    for dataset_name in dataset_names:
        new_dataset_result_folder = Path(new_data_dir,
                                         f'{dataset_name}_dataset')
        existing_dataset_result_folder = Path(existing_data_dir,
                                              f'{dataset_name}_dataset')

        for split_name in split_names:
            new_split_data = Path(new_dataset_result_folder,
                                  f'{split_name}.json')
            new_target_collection = TargetTextCollection.load_json(
                new_split_data)
            exist_split_data = Path(existing_dataset_result_folder,
                                    f'{split_name}.json')
            exist_target_collection = TargetTextCollection.load_json(
                exist_split_data)

            id_results = defaultdict(dict)
            for text_id, target_text in new_target_collection.items():
                for prediction_key in prediction_keys_to_copy:
                    cnn_result = target_text[prediction_key]
                    id_results[text_id][prediction_key] = cnn_result

            len_err = ('The number of keys associated with the merged '
                       f'predictions, {len(id_results)}, is not the same as '
                       'the number of keys in the existing predictions '
                       f'dataset, {len(exist_target_collection)}.')
Example #8
                        type=str,
                        help='Metric to be used on the error subsets')
    args = parser.parse_args()
    results_dir = args.results_dir
    save_fp = args.save_fp
    subset_metric_func = getattr(sentiment_metrics, args.subset_metric)
    # Get the data
    data_splits = ['test', 'val']
    dataset_names = ['election', 'laptop', 'restaurant']
    index_keys = ['prediction key', 'run number']

    all_dfs: List[pd.DataFrame] = []
    training_datasets = {}
    for dataset_name in dataset_names:
        data_fp = Path(results_dir, f'{dataset_name}_dataset', 'train.json')
        training_datasets[dataset_name] = TargetTextCollection.load_json(
            data_fp)

    formatted_data_split = {'test': 'Test', 'val': 'Validation'}

    import time
    overall_time = time.time()
    for data_split in data_splits:
        print(f'Data Split {data_split}')
        for dataset_name in dataset_names:
            one_time = time.time()
            print(f'Dataset {dataset_name}')
            data_fp = Path(results_dir, f'{dataset_name}_dataset',
                           f'{data_split}.json')
            test_collection = TargetTextCollection.load_json(data_fp)
            metric_df = overall_metric_results(
                test_collection,
Example #9
    nt_error_names = ERROR_SPLIT_SUBSET_NAMES['NT']
    ds_error_names = ERROR_SPLIT_SUBSET_NAMES['DS']
    tssr_error_names = ERROR_SPLIT_SUBSET_NAMES['TSSR']
    reduced_collection_subset_names = ds_error_names + tssr_error_names
    nt_split_subsets = {'NT': ERROR_SPLIT_SUBSET_NAMES['NT']}

    import time
    overall_time = time.time()
    for dataset_name in dataset_names:
        print(f'Dataset {dataset_name}')
        for split in splits:
            one_time = time.time()
            print(f'Data Split {split}')
            data_fp = Path(results_dir, f'{dataset_name.lower()}_dataset',
                           f'{split}.json')
            dataset = TargetTextCollection.load_json(data_fp)
            for error_func in all_relevant_error_funcs:
                error_func(None, dataset, True)
            for reduced_collection_subset_name in reduced_collection_subset_names:
                temp_df = error_split_df(
                    None,
                    dataset,
                    relevant_prediction_keys,
                    'target_sentiments',
                    nt_split_subsets,
                    accuracy, {'ignore_label_differences': True},
                    include_dataset_size=True,
                    collection_subsetting=[[reduced_collection_subset_name]],
                    table_format_return=False)
                temp_df = add_metadata_to_df(temp_df, dataset,
                                             'predicted_target_sentiment_key')
            print('Not overwriting the data')

Example #10
    # Process the training, validation, and test data
    split_names = ['train', 'val', 'train_val', 'test']
    split_name_dataset = []
    train_size = 0
    val_size = 0
    train_text_sentiment_distribution = Counter()
    val_text_sentiment_distribution = Counter()
    train_target_sentiment_distribution = Counter()
    val_target_sentiment_distribution = Counter()
    for split_name in split_names:
        data_fp = Path(args.dataset_dir, f'{split_name}.json')
        if split_name == 'train_val':
            data_fp = Path(args.dataset_dir, 'val.json')
        dataset = TargetTextCollection.load_json(data_fp)
        if split_name == 'train':
            train_target_sentiment_distribution = get_target_sentiment_distribution(dataset)
        if split_name == 'train_val':
            val_target_sentiment_distribution = get_target_sentiment_distribution(dataset)

        if split_name == 'train' and average_sentiment:
            dataset.one_sentiment_text('target_sentiments', average_sentiment)
        elif split_name == 'train' and not average_sentiment:
            dataset = remove_multi_sentiment_targets(dataset)

        if split_name == 'train_val' and average_sentiment:
            dataset.one_sentiment_text('target_sentiments', average_sentiment)
        elif split_name == 'train_val' and not average_sentiment:
            dataset = remove_multi_sentiment_targets(dataset)
        
Example #11
data_name_mapper = {
    'election': 'Election',
    'laptop': 'Laptop',
    'restaurant': 'Restaurant'
}
split_name_mapper = {'train': 'Train', 'val': 'Validation', 'test': 'Test'}

all_dataset_names = []
all_split_names = []
targets_in_split = []

for dataset_name in dataset_names:
    dataset_dir = data_dir / f'{dataset_name}_dataset'
    total_targets_datasets = 0
    for split_name in split_names:
        data_fp = dataset_dir / f'{split_name}.json'
        total_targets_datasets += TargetTextCollection.load_json(
            data_fp).number_targets()
    for split_name in split_names:
        data_fp = dataset_dir / f'{split_name}.json'
        num_targets = TargetTextCollection.load_json(data_fp).number_targets()
        num_targets = f'{num_targets} ({(num_targets/total_targets_datasets)*100:.2f}%)'
        targets_in_split.append(num_targets)
        all_split_names.append(split_name_mapper[split_name])
        all_dataset_names.append(data_name_mapper[dataset_name])
    targets_in_split.append(total_targets_datasets)
    all_split_names.append('Total')
    all_dataset_names.append(data_name_mapper[dataset_name])
import pandas as pd
stats_df = pd.DataFrame({
    'Dataset': all_dataset_names,
    'Data Split': all_split_names,
    'values': targets_in_split
})
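
A plausible next step, not shown in the excerpt, is pivoting this long-format
frame into a dataset-by-split table:

# Hedged sketch: pivot the long-format statistics into a Dataset x Data Split
# table; purely illustrative, not from the source script.
print(stats_df.pivot(index='Dataset', columns='Data Split', values='values'))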
Example #12
def parse_path(path_string: str) -> Path:
    path_string = Path(path_string).resolve()
    return path_string


if __name__ == '__main__':
    save_dir_help = ('File Path to directory where the anonymised results '
                     'will be saved.')
    results_dir_help = ('File path to the directory that currently stores all '
                        'results')
    parser = argparse.ArgumentParser()
    parser.add_argument("results_dir", type=parse_path, help=results_dir_help)
    parser.add_argument("save_dir", type=parse_path, help=save_dir_help)
    args = parser.parse_args()

    save_dir = args.save_dir
    results_dir = args.results_dir
    save_dir.mkdir(parents=True, exist_ok=True)

    dataset_names = ['election', 'laptop', 'restaurant']
    split_names = ['train', 'val', 'test']
    for dataset_name in dataset_names:
        dataset_result_folder = Path(results_dir, f'{dataset_name}_dataset')
        save_dataset_folder = Path(save_dir, f'{dataset_name}_dataset')
        save_dataset_folder.mkdir(parents=True, exist_ok=True)
        for split_name in split_names:
            split_fp = Path(dataset_result_folder, f'{split_name}.json')
            split_dataset: TargetTextCollection = TargetTextCollection.load_json(
                split_fp)
            split_dataset.anonymised = True
            save_fp = Path(save_dataset_folder, f'{split_name}.json')
            split_dataset.to_json_file(save_fp, include_metadata=True)
Example #13
from pathlib import Path

import pandas as pd

from target_extraction.data_types import TargetTextCollection, TargetText
from target_extraction.data_types_util import Span
from target_extraction.analysis.dataset_statistics import get_sentiment_counts
from target_extraction.analysis.dataset_statistics import average_target_per_sentences
from target_extraction.analysis.dataset_statistics import dataset_target_sentiment_statistics
from target_extraction.analysis.dataset_statistics import tokens_per_target
from target_extraction.analysis.dataset_statistics import dataset_target_extraction_statistics
from target_extraction.analysis.dataset_statistics import _statistics_to_dataframe
from target_extraction.analysis.dataset_statistics import tokens_per_sentence
from target_extraction.tokenizers import whitespace

DATA_DIR = Path(__file__, '..', '..', 'data', 'analysis',
                'sentiment_error_analysis').resolve()
TRAIN_COLLECTION = TargetTextCollection.load_json(
    Path(DATA_DIR, 'train_with_blank.json'))
TRAIN_COLLECTION.name = 'train'
SENTIMENT_KEY = 'target_sentiments'


def test_get_sentiment_counts():
    num_pos = 2
    num_neu = 12
    num_neg = 5
    total = 19.0
    true_sentiment_counts = dict([('positive', num_pos), ('neutral', num_neu),
                                  ('negative', num_neg)])
    sentiment_counts = get_sentiment_counts(TRAIN_COLLECTION,
                                            normalised=False,
                                            sentiment_key=SENTIMENT_KEY)
    assert len(true_sentiment_counts) == len(sentiment_counts)
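
total = 19.0 above (the sum 2 + 12 + 5) is unused in the excerpt, so it
presumably feeds the truncated normalised check. A sketch of how that check
might continue, assuming normalised=True divides each raw count by the total
number of targets:

# Hedged continuation of the truncated test; assumes normalised=True returns
# count / total for each sentiment label.
norm_sentiment_counts = get_sentiment_counts(TRAIN_COLLECTION,
                                             normalised=True,
                                             sentiment_key=SENTIMENT_KEY)
for sentiment, count in true_sentiment_counts.items():
    assert norm_sentiment_counts[sentiment] == count / total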
Example #14
import argparse
from pathlib import Path

from target_extraction.data_types import TargetTextCollection


def parse_path(path_string: str) -> Path:
    path_string = Path(path_string).resolve()
    return path_string


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", type=parse_path)
    args = parser.parse_args()

    for split_name in ['train.json', 'val.json', 'test.json']:
        split_fp = Path(args.data_dir, split_name)
        collection = TargetTextCollection.load_json(split_fp)
        for value in collection.values():
            try:
                value.re_order(keys_not_to_order=[
                    'pos_tags', 'tokenized_text', 'category_sentiments',
                    'categories'
                ])
            except Exception:
                print(value)
        collection.re_order(keys_not_to_order=[
            'pos_tags', 'tokenized_text', 'category_sentiments', 'categories'
        ])
        assert collection.in_order()
        collection.to_json_file(split_fp, include_metadata=True)
Example #15
                all_targets.append(aug_target_object)
            except OverLappingTargetsError:
                # This needs to be skipped because when targets overlap it is
                # very difficult to calculate all of the possible span offsets
                # for the other targets. Furthermore, there are only 3
                # occasions where this happens, so it is a very rare occurrence.
                continue
    return all_targets

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("augmented_dataset", type=parse_path, 
                        help='File path the augmented dataset')
    parser.add_argument("save_fp", type=parse_path, 
                        help='File path to save the new re-formated augmented dataset')
    args = parser.parse_args()

    augmented_data_fp = args.augmented_dataset
    save_fp = args.save_fp

    augmented_dataset = TargetTextCollection.load_json(augmented_data_fp)
    new_dataset = []

    for target_object in augmented_dataset.values():
        augmented_targets = add_augmented_targets(target_object, 
                                                  remove_repeats=True)
        new_dataset.extend(augmented_targets)
    new_dataset = TargetTextCollection(new_dataset)
    number_samples = new_dataset.number_targets()
    print(f'The number of samples in the dataset {number_samples}')
    new_dataset.to_json_file(save_fp)
Example #16
        config_params.to_file(temp_file.name)
        only_produce_model = bool(args.run_to_get_saved_model)
        run_model(train_fp,
                  train_val_fp,
                  val_fp,
                  test_fp,
                  Path(temp_file.name),
                  args.N,
                  prediction_key,
                  save_dir=model_dir_save_dir,
                  only_produce_model=only_produce_model)

    # Add metadata and ensure that the datasets have their correct name
    # associated with them
    for split_name, data_fp in [('Validation', val_fp), ('Test', test_fp)]:
        data = TargetTextCollection.load_json(data_fp, name=args.domain)
        metadata = data.metadata if data.metadata else {}
        metadata['split'] = split_name
        sentiment_key_metadata = {}
        sentiment_key_metadata_key = 'predicted_target_sentiment_key'
        if 'predicted_target_sentiment_key' in metadata:
            sentiment_key_metadata = metadata['predicted_target_sentiment_key']
        key_metadata = {}
        key_metadata['CWR'] = bool(args.cwr)
        key_metadata['Position'] = bool(args.position)
        key_metadata[
            'Inter-Aspect'] = args.inter_aspect if args.inter_aspect else False
        key_metadata['Model'] = args.model_name
        key_metadata['data-trained-on'] = data_trained_on_name
        sentiment_key_metadata[prediction_key] = key_metadata
        metadata['predicted_target_sentiment_key'] = sentiment_key_metadata
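
For reference, the nested metadata assembled above would look schematically
like the following; the prediction key is borrowed from Example #7 and all
values are illustrative, not taken from a real run:

# Illustrative shape of the metadata written back into the collection.
example_metadata = {
    'split': 'Test',
    'predicted_target_sentiment_key': {
        'predicted_target_sentiment_CNN_GloVe_None_None': {
            'CWR': False, 'Position': False, 'Inter-Aspect': False,
            'Model': 'CNN', 'data-trained-on': 'laptop'}}}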
Example #17
                        help='File path JSON training data')
    parser.add_argument("glove_embedding_fp",
                        type=parse_path,
                        help=glove_fp_help)
    parser.add_argument("target_only_word2vec_path",
                        type=parse_path,
                        help='File path to save the embedding to.')
    args = parser.parse_args()

    save_fp = args.target_only_word2vec_path
    if save_fp.exists():
        print('A file already exists at the location to store '
              f'the new Word2Vec model/vector: {save_fp}\n'
              'Thus skipping the rest of this script.')
    else:
        dataset = TargetTextCollection.load_json(args.json_train_data)
        all_targets = list(dataset.target_count(lower=True).keys())
        tokenizer = spacy_tokenizer()
        tokenised_targets = [
            target for targets in all_targets for target in tokenizer(targets)
        ]
        with tempfile.TemporaryDirectory() as temp_dir:
            shrink_glove_temp_fp = Path(temp_dir, 'temp_glove')
            shrink_word_vec_temp_fp = Path(temp_dir, 'temp_wordvec')
            shrink_glove_file(args.glove_embedding_fp, tokenised_targets,
                              shrink_glove_temp_fp)
            glove2word2vec(shrink_glove_temp_fp, shrink_word_vec_temp_fp)

            model = KeyedVectors.load_word2vec_format(shrink_word_vec_temp_fp)
            model.save(str(save_fp))
        print(f'Word2Vec shrunk to target model saved to {save_fp}')
Example #18
def run_model(train_fp: Path,
              train_val_fp: Path,
              val_fp: Path,
              test_fp: Path,
              config_fp: Path,
              number_runs: int,
              prediction_key: str,
              save_dir: Optional[Path] = None,
              only_produce_model: bool = False) -> None:
    '''
    :param train_fp: Path to file that contains JSON formatted training data
    :param train_val_fp: Path to file that contains JSON formatted validation data
                         that will be used to train the model (used for early 
                         stopping).
    :param val_fp: Path to file that contains JSON formatted validation data 
                   that will be predicted on and used to evaluate the text 
                   classification model on the target sentiment task
    :param test_fp: Path to file that contains JSON formatted testing data
                    for target sentiment evaluation.
    :param config_fp: Path to file that contains the models configuration
    :param number_runs: Number of times to run the model
    :param prediction_key: The key to save the predictions to within the 
                           validation and test data
    :param save_dir: Path to save the model to.
    :param only_produce_model: Whether or not to train the model once more
                               after all the predictions have been made, so
                               that a trained model is saved to `save_dir`
    '''
    # Test if all the predictions have already been made
    temp_test_data = TargetTextCollection.load_json(test_fp)
    temp_test_value = next(temp_test_data.dict_iterator())
    predictions_left = number_runs
    if prediction_key in temp_test_value:
        number_runs_done = len(temp_test_value[prediction_key])
        predictions_left = predictions_left - number_runs_done
        if number_runs_done >= number_runs and not only_produce_model:
            print('Predictions have already been made')
            return
    train_data = TargetTextCollection.load_json(train_fp)
    train_val_data = TargetTextCollection.load_json(train_val_fp)
    val_data = TargetTextCollection.load_json(val_fp)
    test_data = TargetTextCollection.load_json(test_fp)

    if only_produce_model:
        model = AllenNLPModel('model',
                              config_fp,
                              save_dir=save_dir,
                              predictor_name='target-sentiment')
        model.fit(train_data, train_val_data, test_data)

    for run in range(predictions_left):
        print(f'Run number {run}')
        if run == 0 and predictions_left == number_runs:
            model = AllenNLPModel('model',
                                  config_fp,
                                  save_dir=save_dir,
                                  predictor_name='target-sentiment')
        else:
            model = AllenNLPModel('model',
                                  config_fp,
                                  predictor_name='target-sentiment')
        model.fit(train_data, train_val_data, test_data)
        text_classification_prediction(model, val_data, prediction_key)
        text_classification_prediction(model, test_data, prediction_key)
        #for value in model._predict_iter(val_data.dict_iterator(), yield_original_target=True):
        #    prediction_object, target_object = value
        #    predicted_sentiment = prediction_object['label']
        #    true_sentiment = target_object['target_sentiments']
        #    number_sentiments = len(true_sentiment)
        #    predicted_sentiment = [predicted_sentiment] * number_sentiments
        #    text_id = target_object['text_id']
        #    if prediction_key not in val_data[text_id]:
        #        val_data[text_id][prediction_key] = []
        #    val_data[text_id][prediction_key].append(predicted_sentiment)
        #model.predict_into_collection(val_data, key_mapping=key_mappings)
        #model.predict_into_collection(test_data, key_mapping=key_mappings)
    val_data.to_json_file(val_fp, include_metadata=True)
    test_data.to_json_file(test_fp, include_metadata=True)
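
text_classification_prediction is not shown here, but the commented-out loop
above reads like its inlined predecessor; a hedged reconstruction based on
that loop:

# Hedged reconstruction of text_classification_prediction, modelled on the
# commented-out loop above: the text classifier predicts one label per
# sentence, which is broadcast to every target in that sentence.
def text_classification_prediction(model: AllenNLPModel,
                                   data: TargetTextCollection,
                                   prediction_key: str) -> None:
    for prediction, target in model._predict_iter(data.dict_iterator(),
                                                  yield_original_target=True):
        predicted = [prediction['label']] * len(target['target_sentiments'])
        text_id = target['text_id']
        if prediction_key not in data[text_id]:
            data[text_id][prediction_key] = []
        data[text_id][prediction_key].append(predicted)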
Example #19
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "train_augmented_fp",
        type=parse_path,
        help='File path to the training only targets augmented dataset')
    parser.add_argument(
        "predicted_augmented_fp",
        type=parse_path,
        help='File path to the predicted only targets augmented dataset')
    parser.add_argument("save_fp",
                        type=parse_path,
                        help='File path to save the new combined dataset')
    args = parser.parse_args()

    augmented_train_dataset = TargetTextCollection.load_json(
        args.train_augmented_fp)
    augmented_pred_dataset = TargetTextCollection.load_json(
        args.predicted_augmented_fp)
    save_fp = args.save_fp

    assert len(augmented_train_dataset) == len(augmented_pred_dataset)
    assert augmented_train_dataset == augmented_pred_dataset
    assert augmented_pred_dataset == augmented_train_dataset

    combined_target_objects = []
    for text_id, train_target_text in augmented_train_dataset.items():
        pred_target_text = augmented_pred_dataset[text_id]

        train_targets = train_target_text['targets']
        pred_targets = pred_target_text['targets']
        target_not_same_err = ('The target lists have to be the same within '
Example #20
                            "target based on the model's confidence scores"
    save_fp_help = 'File to save the predicted targets to, each target will be '\
                   'saved on a new line'
    parser = argparse.ArgumentParser()
    parser.add_argument("predicted_target_data_fp",
                        type=parse_path,
                        help='File path to the predicted targets')
    parser.add_argument("train_fp", type=parse_path, help=train_fp_help)
    parser.add_argument('confidence_score',
                        type=float,
                        help=confidence_score_help)
    parser.add_argument('save_fp', type=parse_path, help=save_fp_help)
    args = parser.parse_args()

    # Setting the data up
    train_data = TargetTextCollection.load_json(args.train_fp)
    train_targets = set(list(train_data.target_count(lower=True).keys()))

    acceptable_confidence = args.confidence_score
    all_targets: List[TargetText] = []
    with args.predicted_target_data_fp.open('r') as predicted_file:
        for index, line in enumerate(predicted_file):
            target_data = json.loads(line)
            target_id = str(index)
            target_data_dict = {
                'text': target_data['text'],
                'text_id': target_id,
                'confidences': target_data['confidence'],
                'sequence_labels': target_data['sequence_labels'],
                'tokenized_text': target_data['tokens']
            }
Example #21
    # save directory
    encoder = encoder_name(args.elmo, args.elmo_ds, args.word_embedding,
                           args.word_embedding_ds, args.elmo_contextualised)
    print(encoder)
    if 'use_target_sequences' in config_params['model']:
        print(config_params['model']['use_target_sequences'])
    save_dir = Path(args.save_dir, args.model_name, args.domain,
                    encoder).resolve()
    save_dir.mkdir(parents=True, exist_ok=True)
    test_save_fp = Path(save_dir, 'pred_test.json')
    val_save_fp = Path(save_dir, 'pred_val.json')

    dataset_dir = args.dataset_dir
    train_fp = Path(dataset_dir, 'train.json')
    if test_save_fp.exists() and val_save_fp.exists():
        save_test_data = TargetTextCollection.load_json(test_save_fp)
        test_value = next(iter(save_test_data.values()))
        num_predictions = len(test_value['predicted_target_sentiments'])
        if num_predictions >= args.N:
            print(
                f'Predictions have already been made at the following directory {save_dir}'
            )
        else:
            predictions_left = args.N - num_predictions
            print(f'Number of predictions left {predictions_left}')
            with tempfile.NamedTemporaryFile(mode='w+') as temp_file:
                config_params.to_file(temp_file.name)
                run_model(train_fp, val_save_fp, test_save_fp,
                          Path(temp_file.name), test_save_fp, predictions_left,
                          None, val_save_fp)