Example #1
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset +
                                          test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))
    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # imported here to ensure the reproducibility of the experiment
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop(
                                    "cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
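
A minimal command-line entry point for the function above might look like the sketch below; the positional arguments mirror the attributes main() reads (config_path, output_dir), but the exact CLI layout is an assumption, not part of the original example.

import argparse

if __name__ == "__main__":
    # Hypothetical wrapper around main() above; flag names are assumptions.
    parser = argparse.ArgumentParser(
        description="Train an AllenNLP model from a config file.")
    parser.add_argument("config_path",
                        help="Path to the experiment config (JSON/Jsonnet).")
    parser.add_argument("output_dir",
                        help="Directory for the vocabulary, config copy, and checkpoints.")
    main(parser.parse_args())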
Example #2
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #3
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read('tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab, params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #4
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None
    if 'model' not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        loader_params = params.pop("data_loader")
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=loader_params.duplicate())
        dev_data_loader = None
        if validation_data is not None:
            dev_data_loader = DataLoader.from_params(
                dataset=validation_data, params=loader_params.duplicate())
        # index the datasets with the vocabulary before building the trainer
        train_data.index_with(vocab)
        if validation_data is not None:
            validation_data.index_with(vocab)

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
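
A rough way to drive run_config() is with a JSON string; the configuration below is only a sketch (the reader type, paths, and hyperparameters are placeholders, not a tested experiment file).

config = """
{
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
        },
        "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}
    },
    "data_loader": {"batch_size": 2},
    "trainer": {"num_epochs": 1, "optimizer": {"type": "adam"}}
}
"""
results = run_config(config)
print(results["vocab"])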
Example #5
    def setUp(self):
        super().setUp()
        # TODO make this a set of dataset readers
        # Classification may be easier in this case. Same dataset reader but with different paths 
        self.instances_list = []
        self.instances_list.append(SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging.tsv'))
        self.instances_list.append(SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging1.tsv'))
        self.instances_list.append(SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'meta_seq' / 'sequence_tagging2.tsv'))
        # loop through dataset readers and extend vocab
        combined_vocab = Vocabulary.from_instances(self.instances_list[0])

        for instances in self.instances_list[1:]:
            combined_vocab.extend_from_instances(Params({}), instances=instances)
        self.vocab = combined_vocab
        # Figure out params TODO 
        self.model_params = Params({
                "text_field_embedder": {
                        "token_embedders": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                        }
                                }
                        },
                "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                        }
                })
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
        self.iterator = BasicIterator(batch_size=2)
        self.iterator.index_with(combined_vocab)
Example #6
    def test_console_log_callback(self):
        total_instances = 1000
        batch_size = 25

        reader = FakeDatasetReader(total_instances, batch_size)
        data_loader = SimpleDataLoader.from_dataset_reader(
            reader, "fake_path", batch_size=batch_size)
        instances = list(data_loader.iter_instances())
        vocab = Vocabulary.from_instances(instances)
        data_loader.index_with(vocab)
        model = FakeModel(vocab)
        optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)

        trainer = GradientDescentTrainer(
            model,
            optimizer,
            data_loader,
            num_epochs=3,
            serialization_dir=self.TEST_DIR,
            callbacks=[
                ConsoleLoggerCallback.from_params(
                    Params({"should_log_inputs": True}),
                    serialization_dir=self.TEST_DIR,
                )
            ],
        )
        trainer.train()
Example #7
def get_pico_label_vocab():
    labels = ['O', 'I-PAR', 'I-OUT', 'I-INT']
    labels = [
        Instance({'label': LabelField(l, label_namespace='labels')})
        for l in labels
    ]
    return Vocabulary.from_instances(labels)
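
A quick sanity check of the label-only vocabulary built above; it relies only on get_pico_label_vocab() and the standard Vocabulary API (the printed size assumes the default non-padded "labels" namespace).

vocab = get_pico_label_vocab()
print(vocab.get_vocab_size("labels"))            # 4: O, I-PAR, I-OUT, I-INT
print(vocab.get_token_index("I-PAR", "labels"))  # integer id assigned to I-PAR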
Example #8
 def init_vocab(self, file_path):
     # datareader = IMDBDatasetReader(1)
     if file_path and os.path.isfile(file_path):
         instances = self.datareader.read(file_path)
         self.vocab = Vocabulary.from_instances(instances)
     else:
         self.vocab = None
Example #9
    def test_trainer_can_log_batch_inputs(self):
        total_instances = 1000
        batch_size = 25

        reader = FakeDatasetReader(total_instances, batch_size)
        data_loader = SimpleDataLoader.from_dataset_reader(
            reader, "fake_path", batch_size=batch_size)
        instances = list(data_loader.iter_instances())
        vocab = Vocabulary.from_instances(instances)
        data_loader.index_with(vocab)
        model = FakeModel(vocab)
        optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)

        trainer = GradientDescentTrainer(
            model,
            optimizer,
            data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            callbacks=[
                TensorBoardCallback(
                    serialization_dir=self.TEST_DIR,
                    distribution_interval=2,
                )
            ],
        )
        trainer.train()
Example #10
def build_vocab(instances: Optional[Iterable[Instance]] = None,
                from_transformer: bool = False) -> Vocabulary:
    """
    Build the Vocabulary object from the instances only,
    or from the pretrained transformer, based on boolean flag

    :param instances: Iterable of allennlp instances.
    :param from_transformer: Whether to initialize vocab from
                             pretrained transformer, or from
                             instances directly.
    :return: The Vocabulary object.
    """
    # log.debug("Building the vocabulary.")

    if from_transformer:
        vocab = Vocabulary.from_pretrained_transformer(
            model_name="bert-base-uncased")

    elif instances:
        vocab = Vocabulary.from_instances(instances)

    else:
        print("No instances to create vocab with, and pretrained"
              " transformer isn't being used.")
        raise UnskippableSituationError()

    return vocab
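
A hedged usage sketch for build_vocab(); the reader and data path below are placeholders standing in for whatever this project actually uses.

from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

# Placeholder reader/path; swap in the project's own DatasetReader.
instances = list(SequenceTaggingDatasetReader().read("data/train.tsv"))
vocab_from_data = build_vocab(instances=instances)
vocab_from_bert = build_vocab(from_transformer=True)  # wordpiece vocab of bert-base-uncased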
Example #11
def test():
    reader = NameReader()
    instances = reader.read('./data/first_names.all.txt')
    instances = ensure_list(instances)

    # expected few names
    fields = instances[0].fields
    logger.info(fields)
    tokens = [t.text for t in fields['tokens']]
    logger.info(tokens)

    fields = instances[1].fields
    tokens = [t.text for t in fields['tokens']]
    logger.info(tokens)

    # Now we need to create a small vocabulary from our sentences. Note that since we have used
    # only character indexers, calling Vocabulary.from_instances will create a vocabulary whose
    # namespaces correspond to those of each token indexer in our TextFields.

    # build vocabulary

    vocab = Vocabulary.from_instances(instances)

    print("This is the token ids vocabulary we created \n")
    print(vocab.get_index_to_token_vocabulary('character_vocab'))

    for instance in instances:
        instance.index_fields(vocab)

    # get the tensor dict
    logger.info(instances[0].as_tensor_dict())
Example #12
 def setup_method(self):
     super().setup_method()
     self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     self.reader = SequenceTaggingDatasetReader()
     self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
     self.data_loader_lazy = MultiProcessDataLoader(
         self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
     )
     self.instances = list(self.data_loader.iter_instances())
     self.vocab = Vocabulary.from_instances(self.instances)
     self.data_loader.index_with(self.vocab)
     self.data_loader_lazy.index_with(self.vocab)
     self.model_params = Params(
         {
             "text_field_embedder": {
                 "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
             },
             "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
         }
     )
     self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
     self.validation_data_loader = MultiProcessDataLoader(
         self.reader, self.data_path, batch_size=2
     )
     self.validation_data_loader.index_with(self.vocab)
Example #13
def create_save_vocab(file_path, target_dir, word_min_count, char_min_count):
    namespace_word = "word2idx"
    namespace_char = "char2idx"
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace=namespace_word),
        "chars": TokenCharactersIndexer(namespace=namespace_char)
    }
    min_count = {
        namespace_word: word_min_count,
        namespace_char: char_min_count
    }

    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances, min_count=min_count)
    word_cnt = vocab.get_vocab_size(namespace_word)
    char_cnt = vocab.get_vocab_size(namespace_char)
    vocab.save_to_files(target_dir)
    print("save word2idx={}, char2idx={} to {}".format(word_cnt, char_cnt,
                                                       target_dir))
    word2idx = vocab.get_index_to_token_vocabulary(namespace_word)
    char2idx = vocab.get_index_to_token_vocabulary(namespace_char)
    print(char2idx)
    vocab = Vocabulary.from_files(target_dir)
    char2idx = vocab.get_index_to_token_vocabulary(namespace_char)
    print(char2idx)
    return
Example #14
    def run(self, reader: DatasetReader,
            splits: Dict[str, str]) -> DatasetDict:  # type: ignore
        """
        * `reader` specifies the old-school dataset reader to use.
        * `splits` maps the names of the splits to the filenames to use for the
          dataset reader. It might look like this:
          ```
          {
              "train": "/path/to/train.json",
              "validation": "/path/to/validation.json"
          }
          ```
        """
        instances_map: Dict[str, Sequence[Instance]] = {
            split_name: list(tqdm(reader.read(path), desc=f"Reading {path}"))
            for split_name, path in splits.items()
        }
        vocab = Vocabulary.from_instances(
            itertools.chain(*instances_map.values()))

        # index all the instances with the vocab
        for split_name, instances in instances_map.items():
            for instance in tqdm(instances, desc=f"Indexing {split_name}"):
                instance.index_fields(vocab)

        return DatasetDict(splits=instances_map, vocab=vocab)
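
Called on the owning step instance, the run() method above might be driven as in the sketch below; the reader choice and file paths are placeholders.

# Hypothetical invocation; `step` stands for an instance of the class defining run().
dataset_dict = step.run(
    reader=SequenceTaggingDatasetReader(),
    splits={
        "train": "data/train.tsv",
        "validation": "data/validation.tsv",
    },
)
print(dataset_dict.vocab.get_vocab_size("tokens"))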
Example #15
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read(self.FIXTURES_ROOT / 'data' /
                                     'squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_embedders'][
            'token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab=vocab, params=params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #16
    def test_from_params(self):

        for config_path in CONFIG_DIR.glob("*.jsonnet"):
            params = Params.from_file(str(config_path),
                                      ext_vars={
                                          "TRAIN_DATA_PATH": "",
                                          "VALID_DATA_PATH": ""
                                      })

            data_reader_params = params["dataset_reader"]
            data_reader_params.pop("type")

            reader = openvaccine.CovidReader.from_params(data_reader_params)
            instances = reader.read(PROJECT_ROOT / "data" / "sample.jsonl")
            vocab = Vocabulary.from_instances(instances)

            batch = Batch(instances)
            batch.index_instances(vocab)

            try:
                model = Model.from_params(params=params["model"], vocab=vocab)
            except Exception as e:
                raise AssertionError(
                    f"unable to load params from {config_path}, because {e}")

            output_dict = model(**batch.as_tensor_dict())

            assert set(output_dict.keys()) == {
                "logits",
                "seq_id",
                "loss",
            }

            assert len(output_dict["logits"].shape) == 3
            assert isinstance(output_dict["seq_id"][0], str)
Example #17
    def test_batch_predictions_are_consistent(self):
        # The same issue as the bidaf test case.
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder. So, we'll remove the CNN encoder entirely from the model for this test.
        # Save some state.

        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params["dataset_reader"])
        reader._token_indexers = {"tokens": reader._token_indexers["tokens"]}
        self.instances = reader.read(FIXTURES_ROOT / "data" / "squad.json")
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params["model"]["text_field_embedder"]["token_embedders"][
            "token_characters"]
        params["model"]["phrase_layer"]["num_convs_per_block"] = 0
        params["model"]["modeling_layer"]["num_convs_per_block"] = 0
        self.model = Model.from_params(vocab=vocab, params=params["model"])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #18
    def test_create_models_from_allennlp_configs(self, config_path):

        params = Params.from_file(
            str(config_path),
            ext_vars={
                "CLF_TRAIN_DATA_PATH": "",
                "CLF_VALID_DATA_PATH": "",
                "DISCRETIZER_PATH": str(DISCRETIZER_PATH),
                "VOCAB_PATH": str(VOCAB_PATH),
            },
        )

        reader = DatasetReader.from_params(params["dataset_reader"])

        instances = reader.read(DATA_PATH)
        vocab = Vocabulary.from_instances(instances)
        num_labels = vocab.get_vocab_size(namespace="labels")

        batch = Batch(instances)
        batch.index_instances(vocab)

        try:
            model = Model.from_params(params=params["model"], vocab=vocab)
        except Exception as e:
            raise AssertionError(f"unable to load params from {config_path}") from e

        output_dict = model(**batch.as_tensor_dict())

        assert "probs" in output_dict
        assert len(output_dict["probs"].shape) == 2
        assert output_dict["probs"].shape[0] == len(instances)
        assert output_dict["probs"].shape[1] == num_labels
Example #19
 def setUp(self):
     super().setUp()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
     )
     self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read(
         self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
     )
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params(
         {
             "text_field_embedder": {
                 "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
             },
             "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
         }
     )
     self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
     self.data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)
     self.data_loader_lazy = DataLoader(
         self.instances_lazy, batch_size=2, collate_fn=allennlp_collate
     )
     self.validation_data_loader = DataLoader(
         self.instances, batch_size=2, collate_fn=allennlp_collate
     )
     self.instances.index_with(vocab)
     self.instances_lazy.index_with(vocab)
Example #20
File: toy.py  Project: nilesh-c/kgqa
    def setUp(self):
        self.reader = ToyReader()
        self.train_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
        self.dev_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
        self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

        token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
                                    embedding_dim=256, padding_index=0)

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                                num_layers=2,
                                                hidden_size=256,
                                                bidirectional=True,
                                                dropout=0.4,
                                                batch_first=True))

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=word_embeddings,
                                   encoder=encoder,
                                   target_embedding_dim=256,
                                   target_namespace='target_tokens',
                                   attention=DotProductAttention(),
                                   max_decoding_steps=25,
                                   beam_size=5,
                                   use_bleu=True
                                   )

        self.model.cuda(0)
Example #21
    def setUp(self):
        super().setUp()

        # A lot of the tests want access to the metric tracker
        # so we add a property that gets it by grabbing it from
        # the relevant callback.
        def metric_tracker(self: CallbackTrainer):
            for callback in self.handler.callbacks():
                if isinstance(callback, TrackMetrics):
                    return callback.metric_tracker
            return None

        setattr(CallbackTrainer, 'metric_tracker', property(metric_tracker))

        self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        vocab = Vocabulary.from_instances(self.instances)
        self.vocab = vocab
        self.model_params = Params({
                "text_field_embedder": {
                        "token_embedders": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                        }
                                }
                        },
                "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                        }
                })
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
Example #22
def get_trainer_from_config(
        config: Params,
        train_instances: List[Instance],
        val_instances: List[Instance],
        device: int,
        serialization_dir: Optional[str] = None) -> Trainer:
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")
    vocab_dir = config.pop("vocab_dir", None)
    if vocab_dir is None:
        vocab = Vocabulary.from_instances(train_instances)
    else:
        vocab = Vocabulary.from_files(vocab_dir)
    model = Model.from_params(model_params, vocab=vocab)
    iterator = DataIterator.from_params(config.pop("iterator"))
    trainer_params["num_serialized_models_to_keep"] = 1
    iterator.index_with(vocab)
    trainer = Trainer.from_params(model=model,
                                  iterator=iterator,
                                  train_data=train_instances,
                                  validation_data=val_instances,
                                  serialization_dir=serialization_dir,
                                  params=trainer_params)
    return trainer
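
One plausible way to call get_trainer_from_config(), assuming a config file that still contains model, iterator, and trainer sections after dataset_reader is popped (the paths are placeholders):

config = Params.from_file("experiment.jsonnet")  # placeholder path
reader = DatasetReader.from_params(config.pop("dataset_reader"))
train_instances = list(reader.read("data/train.tsv"))
val_instances = list(reader.read("data/validation.tsv"))
trainer = get_trainer_from_config(config,
                                  train_instances,
                                  val_instances,
                                  device=-1)  # -1 runs on CPU
trainer.train()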
Example #23
 def setUp(self):
     super(TestTrainer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab,
                                           params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
     self.iterator.index_with(vocab)
Example #24
 def setUp(self):
     super(TestTrainer, self).setUp()
     dataset = SequenceTaggingDatasetReader().read(
         'tests/fixtures/data/sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(dataset)
     self.vocab = vocab
     dataset.index_instances(vocab)
     self.dataset = dataset
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "stacked_encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(self.vocab, self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
Example #25
 def _yield_one_epoch(self, instances: Iterable[Instance], shuffle: bool,
                      cuda_device: int, for_training: bool):
     batches = self._create_batches(instances, shuffle)
     for batch in batches:
         # raw
         batch.index_instances(self.vocab)
         padding_lengths = batch.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(padding_lengths))
         logger.debug("Batch size: %d", len(batch.instances))
         forword_input = {
             'raw':
             batch.as_tensor_dict(padding_lengths,
                                  cuda_device=cuda_device,
                                  for_training=for_training)
         }
         # extended
         extend_vocab = Vocabulary.from_instances(batch.instances)
         self.vocab.extend_from(extend_vocab)
         batch.index_instances(self.vocab)
         padding_lengths = batch.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(padding_lengths))
         logger.debug("Batch size: %d", len(batch.instances))
         forword_input.update({
             'extended':
             batch.as_tensor_dict(padding_lengths,
                                  cuda_device=cuda_device,
                                  for_training=for_training)
         })
         # instance for metrics
         forword_input.update({'instances': batch.instances})
         yield forword_input
Example #26
 def test_from_params(self, data_path: str, sentence_marker_params: Params, ccm_params: Params) -> None:
     reader = DatasetReader.from_params(sentence_marker_params)
     instances = reader.read(data_path)
     vocab = Vocabulary.from_instances(instances)
     ccm_module = ConstrainedConditionalModule.from_params(vocab=vocab, params=ccm_params)
     index = vocab.get_token_index("I-type", "labels")
     assert ccm_module._sentence_penalty_map == (index, 50.)
Example #27
    def set_up_model(self, param_file, dataset_file):

        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params["dataset_reader"])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = reader.read(str(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params["model"])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
Example #28
def read_squad_allennlp(file_path):
    '''read data, build vocab, batch, padding, to idx
    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    token_indexers = {
            "tokens": SingleIdTokenIndexer(namespace="token_ids"),
            "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print(question)
        print(type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print(word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #29
    def test_batch_predictions_are_consistent(self):
        # The same issue as the bidaf test case.
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder. So, we'll remove the CNN encoder entirely from the model for this test.
        # Save some state.
        # pylint: disable=protected-access,attribute-defined-outside-init
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read(self.FIXTURES_ROOT / 'data' /
                                     'squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_embedders'][
            'token_characters']
        params['model']['phrase_layer']['num_convs_per_block'] = 0
        params['model']['modeling_layer']['num_convs_per_block'] = 0
        self.model = Model.from_params(vocab=vocab, params=params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #30
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(str(dataset_file)))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #31
    def build_vocab(self,
                    pipeline: "Pipeline",
                    lazy: bool = False) -> Vocabulary:
        """Build the configured vocabulary

        Parameters
        ----------
        pipeline
            The pipeline used to create the instances from which the vocabulary is built.
        lazy
            If true, instances are lazily loaded from disk, otherwise they are loaded into memory.

        Returns
        -------
        vocab
        """
        vocab = Vocabulary.from_instances(
            instances=(
                instance for dataset in self.datasets
                for instance in dataset.to_instances(pipeline, lazy=lazy)),
            max_vocab_size=self.max_vocab_size,
            min_count=self.min_count,
            pretrained_files=self.pretrained_files,
            only_include_pretrained_words=self.only_include_pretrained_words,
            min_pretrained_embeddings=self.min_pretrained_embeddings,
            tokens_to_add=self.tokens_to_add,
        )

        return vocab
Example #32
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #33
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #34
 def setUp(self):
     super(TestOptimizer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.model_params = Params({
             "text_field_embedder": {
                     "tokens": {
                             "type": "embedding",
                             "embedding_dim": 5
                             }
                     },
             "encoder": {
                     "type": "lstm",
                     "input_size": 5,
                     "hidden_size": 7,
                     "num_layers": 2
                     }
             })
     self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
Example #35
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #36
 def setUp(self):
     super(TestTrainer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
             "text_field_embedder": {
                     "tokens": {
                             "type": "embedding",
                             "embedding_dim": 5
                             }
                     },
             "encoder": {
                     "type": "lstm",
                     "input_size": 5,
                     "hidden_size": 7,
                     "num_layers": 2
                     }
             })
     self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
     self.iterator.index_with(vocab)