Example #1
def test_language_model_data_collator():
    """
    Ensure `LanguageModelingDataCollator` works
    """
    norm_loader = MultiProcessDataLoader(MockDatasetReader(),
                                         "some path",
                                         batch_size=16)
    vocab = Vocabulary.from_instances(norm_loader.iter_instances())
    norm_loader.index_with(vocab)
    batch0 = list(norm_loader)[0]

    model_name = "epwalsh/bert-xsmall-dummy"
    data_collate = LanguageModelingDataCollator(model_name)
    mlm_loader = MultiProcessDataLoader(MockDatasetReader(),
                                        "some path",
                                        batch_size=16,
                                        collate_fn=data_collate)
    vocab = Vocabulary.from_instances(mlm_loader.iter_instances())
    mlm_loader.index_with(vocab)
    batch1 = list(mlm_loader)[0]

    norm_inputs = batch0["source"]["tokens"]["token_ids"]
    mlm_inputs = batch1["source"]["tokens"]["token_ids"]
    mlm_labels = batch1["source"]["tokens"]["labels"]

    # if we replace the MLM inputs with their labels, we should recover the original inputs
    assert torch.where(mlm_labels != -100, mlm_labels,
                       mlm_inputs).tolist() == norm_inputs.tolist()
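
For context, a minimal sketch (plain PyTorch, independent of the collator) of the labelling convention the assertion above relies on: positions that were not masked carry the ignore label -100, while masked positions carry the original token id as the label.

import torch

inputs = torch.tensor([[101, 7592, 103, 2088, 102]])     # illustrative ids; 103 stands in for a masked slot
labels = torch.tensor([[-100, -100, 2003, -100, -100]])  # only the masked position keeps its true token id

# substituting the labels back in at the masked positions recovers the original sequence
original = torch.where(labels != -100, labels, inputs)
assert original.tolist() == [[101, 7592, 2003, 2088, 102]]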
Example #2
    def test_from_dataset_respects_inclusive_embedding_file(self):
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

        vocab = Vocabulary.from_instances(
            self.dataset,
            min_count=4,
            pretrained_files={'tokens': embeddings_filename},
            only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(
            self.dataset,
            min_count=-1,
            pretrained_files={'tokens': embeddings_filename},
            only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Example #3
    def test_from_dataset_respects_exclusive_embedding_file(self):
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, "wb") as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode("utf-8"))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode("utf-8"))

        vocab = Vocabulary.from_instances(
            self.dataset,
            min_count={"tokens": 4},
            pretrained_files={"tokens": embeddings_filename},
            only_include_pretrained_words=True,
        )
        words = vocab.get_index_to_token_vocabulary().values()
        assert "a" in words
        assert "b" not in words
        assert "c" not in words

        vocab = Vocabulary.from_instances(
            self.dataset,
            pretrained_files={"tokens": embeddings_filename},
            only_include_pretrained_words=True,
        )
        words = vocab.get_index_to_token_vocabulary().values()
        assert "a" in words
        assert "b" in words
        assert "c" not in words
Example #4
    def test_from_instances_exclusive_embeddings_file_inside_archive(self):
        """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
        # Read embeddings file from archive
        archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

        with zipfile.ZipFile(archive_path, 'w') as archive:
            file_path = 'embedding.3d.vec'
            with archive.open(file_path, 'w') as embeddings_file:
                embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
                embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

            with archive.open('dummy.vec', 'w') as dummy_file:
                dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

        embeddings_file_uri = format_embeddings_file_uri(
            archive_path, file_path)
        vocab = Vocabulary.from_instances(
            self.dataset,
            min_count={'tokens': 4},
            pretrained_files={'tokens': embeddings_file_uri},
            only_include_pretrained_words=True)

        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(
            self.dataset,
            pretrained_files={'tokens': embeddings_file_uri},
            only_include_pretrained_words=True)
        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words
Example #5
    def test_from_instances_exclusive_embeddings_file_inside_archive(self):
        """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
        # Read embeddings file from archive
        archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

        with zipfile.ZipFile(archive_path, 'w') as archive:
            file_path = 'embedding.3d.vec'
            with archive.open(file_path, 'w') as embeddings_file:
                embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
                embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

            with archive.open('dummy.vec', 'w') as dummy_file:
                dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)

        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)
        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words
Example #6
    def test_from_dataset_respects_max_vocab_size_single_int(self):
        max_vocab_size = 1
        vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
        words = vocab.get_index_to_token_vocabulary().values()
        # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
        assert len(words) == max_vocab_size + 2

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert len(words) == 5
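
Continuing from the `vocab` built just above, a quick hedged check of where those two extra entries come from (assuming the default padding and OOV settings of `Vocabulary`):

index_to_token = vocab.get_index_to_token_vocabulary()
assert index_to_token[0] == "@@PADDING@@"   # default padding token
assert index_to_token[1] == "@@UNKNOWN@@"   # default OOV token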
Example #7
    def test_from_dataset_respects_max_vocab_size_single_int(self):
        max_vocab_size = 1
        vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
        words = vocab.get_index_to_token_vocabulary().values()
        # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
        assert len(words) == max_vocab_size + 2

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert len(words) == 5
Example #8
    def test_from_dataset_respects_min_count(self):
        vocab = Vocabulary.from_instances(self.dataset, min_count={"tokens": 4})
        words = vocab.get_index_to_token_vocabulary().values()
        assert "a" in words
        assert "b" not in words
        assert "c" not in words

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert "a" in words
        assert "b" in words
        assert "c" in words
Example #9
    def test_from_dataset_respects_min_count(self):
        vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Example #10
    def test_from_dataset_respects_min_count(self):
        vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Example #11
    def forward_on_instances(self,
                             instances: List[Instance],
                             cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        model_input = {}
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        if self._pointer_gen:
            model_input.update({'raw': dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)})
            # extend the vocabulary with tokens from this batch, then re-index against it
            extend_vocab = Vocabulary.from_instances(dataset.instances)
            self.vocab.extend_from(extend_vocab)
            dataset.index_instances(self.vocab)
            model_input.update({'extended': dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)})
        else:
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        # extra inputs used at prediction time
        model_input.update({'instances': instances})
        model_input.update({'predict': True})
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
Example #12
def prepare1():
    """
    First part of preparing data for training
    :return: biLSTM model object, biLSTM vocabulary, data for training, data for validation, cuda biLSTM object,
             biLSTM reader object
    """
    reader = PosDatasetReader()
    train_dataset = reader.read(train_path)
    validation_dataset = reader.read(validation_path)

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 200
    HIDDEN_DIM = 200

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))

    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    return model, vocab, train_dataset, validation_dataset, cuda_device, reader
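
A hedged sketch of how prepare1()'s return values might feed the iterator-based Trainer API used in the neighbouring examples (BucketIterator and Trainer imports as in those examples; the batch size, sorting key, and epoch count are illustrative):

model, vocab, train_dataset, validation_dataset, cuda_device, reader = prepare1()

iterator = BucketIterator(batch_size=32, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=torch.optim.Adam(model.parameters()),
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=10,
                  cuda_device=cuda_device)
trainer.train()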
Example #13
def main():
    reader = LinzenDatasetReader(append_null=False)
    train_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.train")
    validation_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.val")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    model = StackRNNAgreementPredictor(vocab,
                                       rnn_dim=100,
                                       rnn_cell_type=torch.nn.GRUCell)
    # model = SimpleRNNAgreementPredictor(
    #     vocab, rnn_dim=18, rnn_type=torch.nn.GRU)

    optimizer = torch.optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=5)
    trainer.train()

    with open("/tmp/model.th", "wb") as fh:
        torch.save(model.state_dict(), fh)
    vocab.save_to_files("/tmp/vocabulary")
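
A hedged sketch of how the artifacts saved above might be loaded back for prediction; the model hyperparameters must match the ones used at training time.

vocab = Vocabulary.from_files("/tmp/vocabulary")
model = StackRNNAgreementPredictor(vocab, rnn_dim=100, rnn_cell_type=torch.nn.GRUCell)
with open("/tmp/model.th", "rb") as fh:
    model.load_state_dict(torch.load(fh))
model.eval()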
Example #14
    def test_read_from_file(self):
        MAX_LEN = 100
        OFFSET_INDICES_HEAD_NAME = 'offset_indices_head'
        OFFSET_INDICES_TAIL_NAME = 'offset_indices_tail'

        reader = FewRelDatasetReader(max_len=MAX_LEN)
        instances = ensure_list(reader.read("tests/fixtures/fewrel.json"))

        vocab = Vocabulary.from_instances(instances)

        fields = instances[0].fields
        tokens = fields['text'].tokens

        head_offsets = [-16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2]
        offset_indices_head = {
            OFFSET_INDICES_HEAD_NAME: [o + MAX_LEN for o in head_offsets]
        }

        tail_offsets = [-13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 0, 1, 2, 3, 4]
        offset_indices_tail = {
            OFFSET_INDICES_TAIL_NAME: [o + MAX_LEN for o in tail_offsets]
        }

        token_indexer_head = OffsetTokenIndexer(token_attribute='offset_head')
        token_indexer_tail = OffsetTokenIndexer(token_attribute='offset_tail')

        assert offset_indices_head == token_indexer_head.tokens_to_indices(
            tokens, vocab, OFFSET_INDICES_HEAD_NAME)
        
        assert offset_indices_tail == token_indexer_tail.tokens_to_indices(
            tokens, vocab, OFFSET_INDICES_TAIL_NAME)
Example #15
def evaluate(model: Model, reader: readers.BaseReader,
             test_data: List[Instance]) -> None:
    visualise_model(model)

    vocab = Vocabulary.from_instances(test_data)
    iterator = BucketIterator(batch_size=ARGS.BATCH_SIZE,
                              sorting_keys=reader.keys)
    # Our data should be indexed using the vocabulary we learned.
    iterator.index_with(vocab)

    data_types = split_list(test_data)
    results: Dict[str, Tuple[int, float]] = {}

    model.eval()

    print()
    print('#' * 5, 'PER TYPE EVALUATION', '#' * 5)
    for qtype, data in data_types.items():
        num_items = len(data)
        print(f'Type: {qtype} ({num_items})')

        metrics = allen_eval(model, data, iterator, ARGS.CUDA_DEVICE, "")
        print()

        accuracy = metrics['accuracy']
        results[qtype] = (num_items, accuracy)
Example #16
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None
    if 'model' not in params:
        # 'dataset' mode: just preview the first 10 instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # duplicate() so the same loader config can build both loaders without being consumed
        loader_params = params.pop("data_loader")
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=loader_params.duplicate())
        train_data.index_with(vocab)

        dev_data_loader = None
        if validation_data is not None:
            dev_data_loader = DataLoader.from_params(dataset=validation_data,
                                                     params=loader_params.duplicate())
            validation_data.index_with(vocab)

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
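
For reference, a hedged sketch of the kind of configuration string run_config expects. The section names follow standard AllenNLP experiment configs; the concrete reader, model, and file paths below are illustrative placeholders, not values from any real project.

CONFIG = """
{
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "path/to/train.tsv",
    "validation_data_path": "path/to/dev.tsv",
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 16}}
        },
        "encoder": {"type": "lstm", "input_size": 16, "hidden_size": 16}
    },
    "data_loader": {"batch_size": 8, "shuffle": true},
    "trainer": {"num_epochs": 1, "optimizer": {"type": "adam"}}
}
"""
results = run_config(CONFIG)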
Example #17
def main(train_file_path, val_file_path, vocab_dir, max_vocab_size, min_frq,
         additional):
    logger = logging.getLogger(__name__)

    reader = CopyNetSharedDecoderDatasetReader("tokens")

    logger.info("Reading train file")
    train = reader.read(train_file_path)
    logger.info("Reading val file")
    val = reader.read(val_file_path)

    added_data = []
    for data in additional:
        logger.info("Adding additional data from {}".format(data))
        added_data.append(reader.read(data))

    if added_data:
        added_data = functools.reduce(lambda a, b: a + b, added_data)

    logger.info("Building vocabulary")
    logger.info("Minimal token frequency: {}".format(min_frq))
    logger.info("Max vocab size: {}".format(max_vocab_size))
    vocab = Vocabulary.from_instances(train + val + added_data,
                                      min_count={'tokens': min_frq},
                                      max_vocab_size=max_vocab_size)
    vocab.add_token_to_namespace('@COPY@', namespace='tokens')
    vocab.add_token_to_namespace('@BLANKED@', namespace='tokens')
    vocab.save_to_files(vocab_dir)
Example #18
    def test_read(self, lazy):
        reader = GLUESST2DatasetReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={'bert': PretrainedBertIndexer(
                pretrained_model=self.BERT_VOCAB_PATH)},
            skip_label_indexing=False
        )
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'dev.tsv'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens']]
        label = example.fields['label'].label
        print(label)
        print(tokens)
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)
        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([vocab.get_token_from_index(i, "bert")
               for i in tokens["bert"].tolist()[0]])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #19
def main():
    reader = LanguageModelingReader()
    train_dataset = reader.read('data/mt/sentences.eng.10k.txt')

    # for inst in train_dataset:
    #     print(inst)

    vocab = Vocabulary.from_instances(train_dataset, min_count={'tokens': 5})

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("input_tokens", "num_tokens")])

    iterator.index_with(vocab)

    model = RNNLanguageModel(vocab, cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      patience=10,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)

    trainer.train()

    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
Example #20
    def test_batches(self):
        readers = {
                "a": PlainTextReader(),
                "b": PlainTextReader(),
                "c": PlainTextReader()
        }

        reader = InterleavingDatasetReader(readers)
        data_dir = self.FIXTURES_ROOT / "data"

        file_path = f"""{{
            "a": "{data_dir / 'babi.txt'}",
            "b": "{data_dir / 'conll2000.txt'}",
            "c": "{data_dir / 'conll2003.txt'}"
        }}"""

        instances = list(reader.read(file_path))
        vocab = Vocabulary.from_instances(instances)

        actual_instance_type_counts = Counter(instance.fields["dataset"].metadata
                                              for instance in instances)

        iterator = HomogeneousBatchIterator(batch_size=3)
        iterator.index_with(vocab)

        observed_instance_type_counts = Counter()

        for batch in iterator(instances, num_epochs=1, shuffle=True):
            # batch should be homogeneous
            instance_types = set(batch["dataset"])
            assert len(instance_types) == 1

            observed_instance_type_counts.update(batch["dataset"])

        assert observed_instance_type_counts == actual_instance_type_counts
Example #21
 def __init__(self):
     self.reader = LinzenDatasetReader()
     self.dataset = self.reader.read('data/rnn_agr_simple/numpred.train')
     self.vocab = Vocabulary.from_instances(self.dataset)
     self.dataset_list = list(iter(self.dataset))
     self.instance = None
     self._label = None
Example #22
 def __init__(self, filename):
     self.reader = LinzenDatasetReader()
     self.dataset = self.reader.read(filename)
     self.vocab = Vocabulary.from_instances(self.dataset)
     self.dataset_list = list(iter(self.dataset))
     self.instance = None
     self._label = None
Example #23
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)

    trainer.train()
Example #24
    def test_skip_smaller_batches(self):
        readers = {
            "a": PlainTextReader(),
            "b": PlainTextReader(),
            "c": PlainTextReader()
        }

        reader = InterleavingDatasetReader(readers)
        data_dir = self.FIXTURES_ROOT / "data"

        file_path = f"""{{
            "a": "{data_dir / 'babi.txt'}",
            "b": "{data_dir / 'conll2000.txt'}",
            "c": "{data_dir / 'conll2003.txt'}"
        }}"""

        instances = list(reader.read(file_path))
        vocab = Vocabulary.from_instances(instances)

        iterator = HomogeneousBatchIterator(batch_size=3,
                                            skip_smaller_batches=True)
        iterator.index_with(vocab)

        for batch in iterator(instances, num_epochs=1, shuffle=True):
            # every batch should have length 3 (batch size)
            assert len(batch["dataset"]) == 3
Example #25
    def from_partial_objects(
        cls,
        serialization_dir: str,
        train_dataset_readers: Dict[str, DatasetReader],
        train_file_paths: Dict[str, str],
        model: Lazy[Model],
        iterator: DataIterator,
        mingler: DatasetMingler,
        optimizer: Lazy[Optimizer],
        num_epochs: int = 10,
    ) -> "MultiTaskTrainer":

        datasets = {
            name: reader.read(train_file_paths[name])
            for name, reader in train_dataset_readers.items()
        }

        instances = (instance for dataset in datasets.values()
                     for instance in dataset)
        vocab = Vocabulary.from_instances(instances=instances)
        model = model.construct(vocab=vocab)
        iterator.index_with(vocab)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer_ = optimizer.construct(model_parameters=parameters)

        return MultiTaskTrainer(model, serialization_dir, iterator, mingler,
                                optimizer_, datasets, num_epochs)
Example #26
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'


        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'identical_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.all_distinct_path = str(self.TEST_DIR / 'all_distinct.tsv')
        with open(self.all_distinct_path, 'w') as all_distinct:
            for i in range(100):
                file_path = self.TEST_DIR / f'distinct_{i}.tsv'
                line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
                with open(file_path, 'w') as f:
                    f.write(line)
                all_distinct.write(line)

        self.identical_files_glob = str(self.TEST_DIR / 'identical_*.tsv')
        self.distinct_files_glob = str(self.TEST_DIR / 'distinct_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
Example #27
def train_model(parameters, name):
    token_indexer = {
        "tokens": ELMoTokenCharactersIndexer()
    } if parameters['use_elmo'] else None
    reader = SSJ500KReader(
        token_indexer) if parameters["dataset"] == "ssj" else SentiCorefReader(
            token_indexer)
    train_dataset = reader.read("train")
    validation_dataset = reader.read("test")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    # vocab = Vocabulary() if parameters['use_elmo'] else Vocabulary.from_instances(train_dataset + validation_dataset)
    model = get_model(vocab, parameters)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = optim.Adam(model.parameters(),
                           lr=parameters['lr'],
                           weight_decay=parameters['weight_decay'])
    iterator = BucketIterator(batch_size=parameters['batch_size'],
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=parameters['patience'],
                      num_epochs=parameters['num_epochs'],
                      cuda_device=cuda_device)
    trainer.train()
    metrics = evaluate(model, validation_dataset, iterator, cuda_device, None)
    save_model_and_vocab(model, vocab, metrics, parameters, fname=name)
Example #28
    def __init__(self,
                 vocab_size: int,
                 max_len: int,
                 batch_size: int,
                 log_dir: str = '',
                 mode: str = 'train',
                 scale: int = 10000):

        data_path = os.path.expanduser('~/data/wiki2bio/')

        train_path = os.path.join(data_path,
                                  'train.t2p.{0}.jsonl'.format(scale))
        dev_path = os.path.join(data_path, 'valid.t2p.jsonl')
        test_path = os.path.join(data_path, 'test.t2p.jsonl')

        vocab_dir = os.path.join(data_path,
                                 'dicts-{0}-t2p-{1}'.format(vocab_size, scale))
        self.metrics = '+f1'
        self.data_path = data_path
        self.mode = mode

        self.train_dataset = Table2PivotDataset(path=train_path,
                                                max_len=max_len)
        self.test_dataset = Table2PivotDataset(path=test_path, max_len=max_len)
        self.dev_dataset = Table2PivotDataset(path=dev_path, max_len=max_len)

        if os.path.exists(vocab_dir):
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = Vocabulary.from_instances(instances=self.train_dataset,
                                              max_vocab_size=vocab_size)
            vocab.save_to_files(vocab_dir)

        collate_fn = basic_collate(vocab=vocab)

        self.train_loader = torch.utils.data.DataLoader(
            dataset=self.train_dataset,
            batch_size=batch_size,
            collate_fn=collate_fn,
            shuffle=True)
        self.dev_loader = torch.utils.data.DataLoader(dataset=self.dev_dataset,
                                                      batch_size=128,
                                                      collate_fn=collate_fn,
                                                      shuffle=False)
        self.test_loader = torch.utils.data.DataLoader(
            dataset=self.test_dataset,
            batch_size=128,
            collate_fn=collate_fn,
            shuffle=False)
        self.vocab = vocab
        self.scale = scale

        if not log_dir:
            self.log_dir = Path(data_path) / 'log' / time.strftime(
                "%Y-%m-%dT%H_%M_%S")
        else:
            self.log_dir = Path(data_path) / 'log' / log_dir
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir = str(self.log_dir)
Example #29
 def _get_expected_vocab(dataset, namespace, model_name):
     vocab_from_instances = Vocabulary.from_instances(dataset)
     instance_tokens = set(
         vocab_from_instances._token_to_index[namespace].keys())
     transformer_tokens = set(
         Vocabulary.from_pretrained_transformer(
             model_name, namespace)._token_to_index[namespace].keys())
     return instance_tokens.union(transformer_tokens)
Example #30
 def build_allennlp_vocab(self, splits=None):
     if splits is None:
         splits = [TRAIN, VALIDATION]
     # iterate over splits first so `split` is bound before it is used
     instances = [instance
                  for split in splits
                  for instance in self.read(self.data_file_paths[split])]
     vocab = Vocabulary.from_instances(instances)
     return vocab
Example #31
def train(model_args):
    model_name = model_args.serialization_dir
    checkpoint_dir = model_args.store_folder
    learning_rate = model_args.learning_rate
    rl_basic = model_args.rl_basic
    pretrain_folder = ''

    if checkpoint_dir == 'pretrain':
        is_pretrain = True
    else:
        # check if rl_basic is specified
        pretrain_folder = os.path.join('pretrain', rl_basic)
        if not os.path.exists(pretrain_folder):
            raise FileNotFoundError(f'Can not find the pretrained model {pretrain_folder}!')
        is_pretrain = False

    reader = construct_reader(is_pretrain=is_pretrain)

    train_dataset = reader.read("data_processed\\train.jsonl")
    test_dataset = reader.read("data_processed\\test.jsonl")

    # build vocabulary
    vocab = Vocabulary.from_instances(train_dataset + test_dataset)

    # build model and move it into cuda
    model = construct_model(vocab, model_args)
    model.cuda()

    # allocate
    optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=learning_rate)
    scheduler = construct_learning_scheduler(optimizer)

    iterator = BucketIterator(batch_size=2, sorting_keys=[("prev_tokens", "num_tokens")])
    iterator.index_with(vocab)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # not recover from previous state, we should load the pretrain model as default.
    if not is_pretrain and not os.path.exists(os.path.join(checkpoint_dir, model_name, "best.th")):
        model_state = torch.load(os.path.join(pretrain_folder, "best.th"))
        model.load_state_dict(model_state)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=test_dataset,
                      learning_rate_scheduler=scheduler,
                      patience=model_args.patience,
                      validation_metric="+{}".format(model_args.validation_metric),
                      num_epochs=model_args.epoch,
                      serialization_dir=os.path.join(checkpoint_dir, model_name),
                      cuda_device=0,
                      should_log_learning_rate=True)

    trainer.train()
    return model_name
Example #32
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
Example #33
 def test_vocab_size_correct_scierc(self):
     vocab = Vocabulary.from_instances(self.instances_scierc)
     # There are 4 unique NER labels and 6 relation labels in the text fixture doc. For the ner
     # labels, there is an extra category for the null label. For the relation labels, there
     # isn't. This is due to the way their respective `Field`s represent labels.
     assert vocab.get_vocab_size("ner_labels") == 5
     assert vocab.get_vocab_size("relation_labels") == 6
     # For numeric labels, vocab size is 0.
     assert vocab.get_vocab_size("coref_labels") == 0
Example #34
 def test_vocab_from_instances_namespaces(self):
     reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
     instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))
     # check that we didn't clobber the labels namespace
     vocab = Vocabulary.from_instances(instances)
     self.assertSetEqual(
             set(vocab._token_to_index.keys()), # pylint: disable=protected-access
             {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags',
              'predicate_arg_tags'}
     )
Example #35
    def test_from_dataset_respects_inclusive_embedding_file(self):
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_filename},
                                          only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_filename},
                                          only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Example #36
    def setUp(self):
        super().setUp()

        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
Example #37
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #38
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}


#### Now that we've implemented a <code>DatasetReader</code> and <code>Model</code>, we're ready to train. We first need an instance of our dataset reader.
reader = PosDatasetReader()
#### Which we can use to read in the training data and validation data. Here we read them in from a URL, but you could read them in from local files if your data was local. We use <code>cached_path</code> to cache the files locally (and to hand <code>reader.read</code> the path to the local cached version.)
train_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/training.txt'))
validation_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/validation.txt'))

#### Once we've read in the datasets, we use them to create our <code>Vocabulary</code> (that is, the mapping[s] from tokens / labels to ids).
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
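
#### (Optional, and purely illustrative) It can be useful to sanity-check what the Vocabulary learned before building the model; the exact numbers and tags you see will depend on your data.
print("Number of token types:", vocab.get_vocab_size('tokens'))
print("Tag vocabulary:", vocab.get_index_to_token_vocabulary('labels'))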

#### Now we need to construct the model. We'll choose a size for our embedding layer and for the hidden layer of our LSTM.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

#### For embedding the tokens we'll just use the <code>BasicTextFieldEmbedder</code> which takes a mapping from index names to embeddings. If you go back to where we defined our <code>DatasetReader</code>, the default parameters included a single index called "tokens", so our mapping just needs an embedding corresponding to that index. We use the <code>Vocabulary</code> to find how many embeddings we need and our <code>EMBEDDING_DIM</code> parameter to specify the output dimension. It's also possible to start with pre-trained embeddings (for example, GloVe vectors), but there's no need to do that on this tiny toy dataset.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
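#### (A hedged aside, not part of the original tutorial) If you did want pre-trained vectors here, older AllenNLP releases let you build the token embedding from a params dict pointing at a GloVe file; the path and dimension below are placeholders and the exact API differs between versions, so this is left commented out.
# from allennlp.common import Params
# token_embedding = Embedding.from_params(
#     vocab=vocab,
#     params=Params({'pretrained_file': '/path/to/glove.6B.100d.txt.gz',
#                    'embedding_dim': 100,
#                    'trainable': False}))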
#### We next need to specify the sequence encoder. The need for <code>PytorchSeq2SeqWrapper</code> here is slightly unfortunate (and if you use <a href = "https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files">configuration files</a> you won't need to worry about it) but here it's required to add some extra functionality (and a cleaner interface) to the built-in PyTorch module. In AllenNLP we do everything batch first, so we specify that as well.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)