def test_train_and_test_dataset():
    """Check the Wang 2017 Election Twitter train/test loaders.

    Runs the loaders twice: once with the default cache directory and once
    with an explicit, temporary cache directory, verifying the split sizes
    and the total number of targets each time.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        custom_cache = Path(temp_dir, 'twitter data')
        for cache_dir in (None, custom_cache):
            train_split = wang_2017_election_twitter_train(cache_dir)
            test_split = wang_2017_election_twitter_test(cache_dir)

            # The training split is expected to be the larger one.
            assert len(train_split) > len(test_split)

            # Combined, the dataset should contain exactly 11899 targets.
            combined = TargetTextCollection.combine(train_split, test_split)
            assert combined.number_targets() == 11899
# Exemplo n.º 2 (score: 0)
    # Build the AllenNLP target-tagger model; trained weights are written to
    # the user supplied save directory.
    model = AllenNLPModel(model_name, args.model_config, 'target-tagger',
                          args.model_save_dir)

    if dataset_name == 'semeval_2014':
        # SemEval data is read from user supplied file paths, so both must
        # be present.
        if not args.train_fp or not args.test_fp:
            raise ValueError('If training and predicting for the SemEval '
                             'datasets the training and test file paths must '
                             'be given')
        # As we are performing target extraction we use the conflict polarity
        # targets like prior work
        train_data = semeval_2014(args.train_fp, conflict=True)
        test_data = semeval_2014(args.test_fp, conflict=True)
    else:
        # The Election Twitter dataset is downloaded/cached into a local
        # directory rather than read from user supplied paths.
        temp_election_directory = Path('.', 'data', 'twitter_election_dataset')
        train_data = wang_2017_election_twitter_train(temp_election_directory)
        test_data = wang_2017_election_twitter_test(temp_election_directory)

    # NOTE(review): presumably this skips the split/tokenize (training prep)
    # work when a saved model directory already exists — confirm against the
    # unseen remainder of this function.
    if not args.model_save_dir.is_dir():
        # Use the same size validation as the test data
        test_size = len(test_data)
        # Create the train and validation splits
        train_data = list(train_data.values())
        train_data, val_data = train_test_split(train_data,
                                                test_size=test_size)
        # Re-wrap the raw split lists back into collections.
        train_data = TargetTextCollection(train_data)
        val_data = TargetTextCollection(val_data)
        # Tokenize the data
        datasets = [train_data, val_data, test_data]
        tokenizer = spacy_tokenizer()

        sizes = []
# Exemplo n.º 3 (score: 0)
    # Output locations for the newly created train/validation/test splits.
    parser.add_argument("save_train_fp", type=parse_path, 
                        help='File Path to save the new training dataset to')
    parser.add_argument("save_val_fp", type=parse_path, 
                        help='File Path to save the new validation dataset to')
    parser.add_argument("save_test_fp", type=parse_path, 
                        help='File Path to save the new test dataset to')
    # Optional behaviour flags; --conflict is forwarded to the dataset parser.
    parser.add_argument("--conflict", action="store_true")
    parser.add_argument("--random", action="store_true")
    parser.add_argument("--remove_errors", action="store_true", 
                        help=remove_errors_help)
    args = parser.parse_args()
    
    if args.dataset_name == 'election_twitter':
        # Election Twitter is downloaded (and cached under CACHE_DIRECTORY)
        # rather than read from user supplied file paths.
        print(f'Downloading the Twitter dataset to {CACHE_DIRECTORY}')
        train_dataset: TargetTextCollection = wang_2017_election_twitter_train()
        test_dataset: TargetTextCollection = wang_2017_election_twitter_test()
    else:
        # Look up the parser function matching the requested dataset name and
        # use it to read the user supplied train/test files.
        dataset_name_parser = {name: parser for name, parser in 
                           zip(valid_dataset_names, dataset_parsers)}
        dataset_parser = dataset_name_parser[args.dataset_name]
        train_dataset: TargetTextCollection  = dataset_parser(args.train_dataset_fp, 
                                                            conflict=args.conflict)
        test_dataset: TargetTextCollection = dataset_parser(args.test_dataset_fp, 
                                                            conflict=args.conflict)

    if args.dataset_name == 'semeval_2016':
        # NOTE(review): presumably de-duplicates samples sharing a span for
        # SemEval 2016 only — confirm against one_sample_per_span's docs.
        train_dataset = train_dataset.one_sample_per_span(remove_empty=True)
        test_dataset = test_dataset.one_sample_per_span(remove_empty=True)
    # If the task is sentiment prediction remove all of the sentences that 
    # do not have targets
    if args.task == 'sentiment':
# Exemplo n.º 4 (score: 0)
import tempfile
from pathlib import Path
from time import time

from target_extraction.dataset_parsers import wang_2017_election_twitter_train, wang_2017_election_twitter_test

# Download both Election Twitter splits into a throw-away cache directory,
# recording the wall-clock start time and each split's target count.
with tempfile.TemporaryDirectory() as temp_dir:
    cache_path = Path(temp_dir, 'first')
    start_time = time()
    train_split = wang_2017_election_twitter_train(cache_path)
    train_target_count = train_split.number_targets()
    test_split = wang_2017_election_twitter_test(cache_path)
    test_target_count = test_split.number_targets()
    print('done')