def main(args):
    """Train a model from a config file and optionally evaluate it on a test set.

    The configuration is read from ``args.config_path``. The vocabulary is
    built from the train and validation data plus the test data when a
    ``test_data_path`` is configured. The vocabulary and a copy of the config
    are written to ``args.output_dir`` before training. When test data is
    present, the ``best.th`` weights are reloaded from the output directory
    and the test metrics are logged and written to ``test_metrics.txt``.

    Global logging is always cleaned up, even when a step raises.

    Args:
        args: Object exposing the ``config_path`` and ``output_dir`` attributes.
    """
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    try:
        prepare_environment(params)

        reader = DatasetReader.from_params(params["dataset_reader"])
        train_dataset = reader.read(params.pop("train_data_path", None))
        valid_dataset = reader.read(params.pop("validation_data_path", None))

        # The test split is optional; when present it also feeds the vocabulary.
        test_data_path = params.pop("test_data_path", None)
        test_dataset = reader.read(test_data_path) if test_data_path else None
        vocab_instances = train_dataset + valid_dataset
        if test_data_path:
            vocab_instances = vocab_instances + test_dataset
        vocab = Vocabulary.from_instances(vocab_instances)

        model_params = params.pop("model", None)
        model = Model.from_params(model_params.duplicate(), vocab=vocab)
        vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))

        # Keep a copy of the config file next to the training artifacts.
        config_copy_path = os.path.join(args.output_dir, "config.json")
        with open(args.config_path, "r", encoding="utf-8") as f_in, \
                open(config_copy_path, "w", encoding="utf-8") as f_out:
            f_out.write(f_in.read())

        iterator = DataIterator.from_params(params.pop("iterator", None))
        iterator.index_with(vocab)

        # A duplicate is handed to the trainer so that "cuda_device" can still
        # be read from ``trainer_params`` for the evaluation step below.
        trainer_params = params.pop("trainer", None)
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=args.output_dir,
                                      iterator=iterator,
                                      train_data=train_dataset,
                                      validation_data=valid_dataset,
                                      params=trainer_params.duplicate())
        trainer.train()

        # Evaluate on the test set with the weights saved as "best.th".
        if test_dataset:
            logging.info("Evaluating on the test set")
            # Kept as a function-local import, as in the original code.
            import torch
            model.load_state_dict(
                torch.load(os.path.join(args.output_dir, "best.th")))
            test_metrics = evaluate(model, test_dataset, iterator,
                                    cuda_device=trainer_params.pop("cuda_device", 0),
                                    batch_weight_key=None)
            logging.info(f"Metrics on the test set: {test_metrics}")
            metrics_path = os.path.join(args.output_dir, "test_metrics.txt")
            with open(metrics_path, "w", encoding="utf-8") as f_out:
                f_out.write(f"Metrics on the test set: {test_metrics}")
    finally:
        # Undo the global logging setup even if an earlier step failed.
        cleanup_global_logging(stdout_handler)
def read_squad_word_char(file_path):
    """Read a SQuAD file and print vocabulary and tensor-shape diagnostics.

    Instances are read with a word-level and a character-level indexer, a
    vocabulary is built from them, and the whole dataset is indexed and padded
    as a single batch. Nothing is returned; all output goes to stdout.

    Args:
        file_path: Path handed to ``SquadReader.read``.
    """
    indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars"),
    }
    squad_instances = SquadReader(token_indexers=indexers).read(file_path)
    vocabulary = Vocabulary.from_instances(squad_instances)

    # Both mappings come from ``get_index_to_token_vocabulary``.
    words_by_index = vocabulary.get_index_to_token_vocabulary("token_ids")
    chars_by_index = vocabulary.get_index_to_token_vocabulary("token_chars")
    print(len(words_by_index))
    print(len(chars_by_index))
    print(chars_by_index)

    squad_batch = Batch(squad_instances)
    squad_batch.index_instances(vocabulary)
    lengths = squad_batch.get_padding_lengths()
    print(lengths)

    tensors = squad_batch.as_tensor_dict(lengths)
    for text_field in ("passage", "question"):
        for indexer_name in ("tokens", "chars"):
            print(tensors[text_field][indexer_name].shape)
    for span_key in ("span_start", "span_end"):
        print(tensors[span_key].shape)
def test_batch_predictions_are_consistent(self):
    """Run the batch-consistency check with the character-level embedder removed.

    The CNN encoder is not properly masked yet, so changing the amount of
    padding in a batch produces small output differences that a deep model
    such as BiDAF magnifies. The test therefore rebuilds the model without the
    ``token_characters`` embedder, runs the shared check, and then restores
    the previously set-up model and instances. If the CNN encoder is fixed to
    respect masking, this can go back to the single-line form other models use.
    """
    # pylint: disable=protected-access,attribute-defined-outside-init
    original_model, original_instances = self.model, self.instances

    # Re-read the data with only the word-level indexer.
    config = Params.from_file(self.param_file)
    squad_reader = DatasetReader.from_params(config['dataset_reader'])
    squad_reader._token_indexers = {'tokens': squad_reader._token_indexers['tokens']}
    self.instances = squad_reader.read('tests/fixtures/data/squad.json')
    word_vocab = Vocabulary.from_instances(self.instances)
    for item in self.instances:
        item.index_fields(word_vocab)

    # Drop the character embedder and shrink the phrase layer input to match.
    del config['model']['text_field_embedder']['token_characters']
    config['model']['phrase_layer']['input_size'] = 2
    self.model = Model.from_params(word_vocab, config['model'])

    self.ensure_batch_predictions_are_consistent()

    # Put back the state that was saved at the top.
    self.model, self.instances = original_model, original_instances
def run_config(config):
    """Build, and when a model is configured train, the pipeline in a JSON config.

    Without a ``model`` section the function only previews the first ten
    instances. With one, it builds the model and data loaders and trains in a
    temporary serialization directory.

    Args:
        config: JSON string holding the configuration.

    Returns:
        A dict with the keys ``params`` (an untouched copy of the parsed
        config), ``dataset_reader``, ``vocab``, ``iterator`` (always ``None``;
        kept so existing callers can still look the key up) and ``model``
        (``None`` when no ``model`` section is present).

    Raises:
        RuntimeError: If ``dataset_reader`` or ``train_data_path`` is missing.
    """
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' not in params:
        raise RuntimeError('`dataset_reader` section is required')
    reader = DatasetReader.from_params(params.pop('dataset_reader'))

    if 'train_data_path' not in params:
        raise RuntimeError('`train_data_path` section is required')
    print('Reading the training data...')
    train_data = reader.read(params.pop('train_data_path'))
    all_instances = list(train_data)

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    # No iterator object exists in the data-loader based flow. The original
    # code called ``iterator.index_with(vocab)`` on this ``None`` value, which
    # always raised AttributeError.
    iterator = None
    if 'model' not in params:
        # 'dataset' mode: just preview the (first 10) instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        loader_params = params.pop("data_loader")

        # Index the datasets themselves with the vocabulary.
        # NOTE(review): assumes ``reader.read`` returns a dataset exposing
        # ``index_with`` (AllenNLP 1.0-style datasets) - confirm.
        train_data.index_with(vocab)
        # Each loader gets its own copy: ``from_params`` consumes the keys of
        # the params object it is given, so sharing one object would leave the
        # second loader with an emptied configuration.
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=deepcopy(loader_params))

        # Validation data is optional, so only build its loader when present.
        dev_data_loader = None
        if validation_data is not None:
            validation_data.index_with(vocab)
            dev_data_loader = DataLoader.from_params(dataset=validation_data,
                                                     params=deepcopy(loader_params))

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
def setUp(self):
    """Read three sequence-tagging fixtures and build a shared vocab, model and iterator."""
    super().setUp()

    # TODO make this a set of dataset readers
    # Classification may be easier in this case. Same dataset reader but with different paths
    fixture_dir = self.FIXTURES_ROOT / 'data' / 'meta_seq'
    fixture_names = ('sequence_tagging.tsv', 'sequence_tagging1.tsv', 'sequence_tagging2.tsv')
    # A fresh reader is created for every file.
    self.instances_list = [
        SequenceTaggingDatasetReader().read(fixture_dir / file_name)
        for file_name in fixture_names
    ]

    # Start from the first dataset, then extend the vocabulary with every dataset.
    self.vocab = Vocabulary.from_instances(self.instances_list[0])
    for dataset in self.instances_list:
        self.vocab.extend_from_instances(Params({}), instances=dataset)
    combined_vocab = self.vocab

    # Figure out params TODO
    embedder_config = {
        "token_embedders": {
            "tokens": {"type": "embedding", "embedding_dim": 5}
        }
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)

    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(combined_vocab)
def test_console_log_callback(self):
    """Train for three epochs with a ``ConsoleLoggerCallback`` that logs inputs."""
    num_instances, per_batch = 1000, 25

    fake_reader = FakeDatasetReader(num_instances, per_batch)
    loader = SimpleDataLoader.from_dataset_reader(
        fake_reader, "fake_path", batch_size=per_batch)
    vocabulary = Vocabulary.from_instances(list(loader.iter_instances()))
    loader.index_with(vocabulary)

    fake_model = FakeModel(vocabulary)
    sgd = torch.optim.SGD(fake_model.parameters(), 0.01, momentum=0.9)

    console_callback = ConsoleLoggerCallback.from_params(
        Params({"should_log_inputs": True}),
        serialization_dir=self.TEST_DIR,
    )
    GradientDescentTrainer(
        fake_model,
        sgd,
        loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        callbacks=[console_callback],
    ).train()
def get_pico_label_vocab():
    """Return a vocabulary built from the labels ``O``, ``I-PAR``, ``I-OUT`` and ``I-INT``.

    Each label is wrapped in a single-field instance whose ``label`` field
    uses the ``labels`` namespace.
    """
    label_instances = []
    for label_name in ('O', 'I-PAR', 'I-OUT', 'I-INT'):
        label_field = LabelField(label_name, label_namespace='labels')
        label_instances.append(Instance({'label': label_field}))
    return Vocabulary.from_instances(label_instances)
def init_vocab(self, file_path):
    """Set ``self.vocab`` from the instances read at ``file_path``.

    ``self.vocab`` is set to ``None`` when ``file_path`` is falsy or does not
    point to an existing file.

    Args:
        file_path: Path of the data file handed to ``self.datareader.read``.
    """
    # Guard clause: nothing to read, so there is no vocabulary.
    if not file_path or not os.path.isfile(file_path):
        self.vocab = None
        return

    loaded_instances = self.datareader.read(file_path)
    self.vocab = Vocabulary.from_instances(loaded_instances)
def test_trainer_can_log_batch_inputs(self):
    """Train for two epochs with a ``TensorBoardCallback`` attached."""
    num_instances, per_batch = 1000, 25

    fake_reader = FakeDatasetReader(num_instances, per_batch)
    loader = SimpleDataLoader.from_dataset_reader(
        fake_reader, "fake_path", batch_size=per_batch)
    vocabulary = Vocabulary.from_instances(list(loader.iter_instances()))
    loader.index_with(vocabulary)

    fake_model = FakeModel(vocabulary)
    sgd = torch.optim.SGD(fake_model.parameters(), 0.01, momentum=0.9)

    tensorboard_callback = TensorBoardCallback(
        serialization_dir=self.TEST_DIR,
        distribution_interval=2,
    )
    GradientDescentTrainer(
        fake_model,
        sgd,
        loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        callbacks=[tensorboard_callback],
    ).train()
def build_vocab(instances: Iterable[Instance] = None,
                from_transformer: bool = False) -> Vocabulary:
    """
    Create a Vocabulary either from the ``bert-base-uncased`` pretrained
    transformer or from the given instances, depending on the boolean flag.

    :param instances: Iterable of allennlp instances.
    :param from_transformer: Whether to initialize the vocab from the
        pretrained transformer instead of from the instances.
    :return Vocabulary: The Vocabulary object.
    :raises UnskippableSituationError: When the flag is off and ``instances``
        is falsy.
    """
    # The transformer flag takes precedence over any instances passed in.
    if from_transformer:
        return Vocabulary.from_pretrained_transformer(
            model_name="bert-base-uncased")

    if instances:
        return Vocabulary.from_instances(instances)

    print("No instances to create vocab with, and pretrained"
          " transformer isn't being used.")
    raise UnskippableSituationError()
def test():
    """Read the first-names file, log a few tokens, and index it with a fresh vocabulary."""
    name_instances = ensure_list(NameReader().read('./data/first_names.all.txt'))

    # Log the fields and tokens of the first instance, then the tokens of the second.
    first_fields = name_instances[0].fields
    logger.info(first_fields)
    logger.info([tok.text for tok in first_fields['tokens']])
    logger.info([tok.text for tok in name_instances[1].fields['tokens']])

    # Build a small vocabulary from the instances. The original author notes
    # that only character indexers are used and that ``from_instances``
    # creates the namespaces of the text field's token indexers.
    vocabulary = Vocabulary.from_instances(name_instances)
    print("This is the token ids vocabulary we created \n")
    print(vocabulary.get_index_to_token_vocabulary('character_vocab'))

    for item in name_instances:
        item.index_fields(vocabulary)

    # get the tensor dict
    logger.info(name_instances[0].as_tensor_dict())
def setup_method(self):
    """Create the data loaders, vocabulary, tagger model and optimizer used by the tests."""
    super().setup_method()

    self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    self.reader = SequenceTaggingDatasetReader()

    # Two training loaders over the same file; the second one caps the
    # number of instances held in memory.
    self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
    self.data_loader_lazy = MultiProcessDataLoader(
        self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
    )

    self.instances = list(self.data_loader.iter_instances())
    self.vocab = Vocabulary.from_instances(self.instances)
    for loader in (self.data_loader, self.data_loader_lazy):
        loader.index_with(self.vocab)

    embedder_config = {
        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
    }
    encoder_config = {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}
    self.model_params = Params(
        {"text_field_embedder": embedder_config, "encoder": encoder_config}
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)

    self.validation_data_loader = MultiProcessDataLoader(
        self.reader, self.data_path, batch_size=2
    )
    self.validation_data_loader.index_with(self.vocab)
def create_save_vocab(file_path, target_dir, word_min_count, char_min_count):
    """Build a word/character vocabulary from a SQuAD file and save it to ``target_dir``.

    After saving, the character mapping is printed twice: once from the
    in-memory vocabulary and once from the vocabulary reloaded from disk.

    Args:
        file_path: Path handed to ``SquadReader.read``.
        target_dir: Directory the vocabulary is saved to and reloaded from.
        word_min_count: ``min_count`` entry for the word namespace.
        char_min_count: ``min_count`` entry for the character namespace.
    """
    word_ns, char_ns = "word2idx", "char2idx"

    squad_reader = SquadReader(token_indexers={
        "tokens": SingleIdTokenIndexer(namespace=word_ns),
        "chars": TokenCharactersIndexer(namespace=char_ns)
    })
    squad_instances = squad_reader.read(file_path)
    thresholds = {word_ns: word_min_count, char_ns: char_min_count}
    built_vocab = Vocabulary.from_instances(squad_instances, min_count=thresholds)

    num_words = built_vocab.get_vocab_size(word_ns)
    num_chars = built_vocab.get_vocab_size(char_ns)
    built_vocab.save_to_files(target_dir)
    print("save word2idx={}, char2idx={} to {}".format(num_words, num_chars, target_dir))

    # The word mapping is fetched but not printed, as in the original code.
    built_vocab.get_index_to_token_vocabulary(word_ns)
    print(built_vocab.get_index_to_token_vocabulary(char_ns))

    # Reload from disk and print the character mapping again.
    reloaded_vocab = Vocabulary.from_files(target_dir)
    print(reloaded_vocab.get_index_to_token_vocabulary(char_ns))
def run(self, reader: DatasetReader, splits: Dict[str, str]) -> DatasetDict:  # type: ignore
    """
    Read every split with the given dataset reader, build one vocabulary over
    all of them, and index every instance with it.

    * `reader` specifies the old-school dataset reader to use.
    * `splits` maps the names of the splits to the filenames to use for the
      dataset reader. It might look like this:
      ```
      {
          "train": "/path/to/train.json",
          "validation": "/path/to/validation.json"
      }
      ```
    """
    # Materialise each split into a list so it can be iterated more than once.
    instances_map: Dict[str, Sequence[Instance]] = {}
    for split_name, path in splits.items():
        instances_map[split_name] = list(tqdm(reader.read(path), desc=f"Reading {path}"))

    all_instances = itertools.chain.from_iterable(instances_map.values())
    vocab = Vocabulary.from_instances(all_instances)

    # index all the instances with the vocab
    for split_name, split_instances in instances_map.items():
        for instance in tqdm(split_instances, desc=f"Indexing {split_name}"):
            instance.index_fields(vocab)

    return DatasetDict(splits=instances_map, vocab=vocab)
def test_batch_predictions_are_consistent(self):
    """Run the batch-consistency check with the character-level embedder removed.

    The CNN encoder is not properly masked yet, so changing the amount of
    padding in a batch produces small output differences that a deep model
    such as BiDAF magnifies. The test therefore rebuilds the model without the
    ``token_characters`` embedder, runs the shared check, and then restores
    the previously set-up model and instances. If the CNN encoder is fixed to
    respect masking, this can go back to the single-line form other models use.
    """
    # pylint: disable=protected-access,attribute-defined-outside-init
    original_model, original_instances = self.model, self.instances

    # Re-read the data with only the word-level indexer.
    config = Params.from_file(self.param_file)
    squad_reader = DatasetReader.from_params(config['dataset_reader'])
    squad_reader._token_indexers = {'tokens': squad_reader._token_indexers['tokens']}
    self.instances = squad_reader.read(self.FIXTURES_ROOT / 'data' / 'squad.json')
    word_vocab = Vocabulary.from_instances(self.instances)
    for item in self.instances:
        item.index_fields(word_vocab)

    # Drop the character embedder and shrink the phrase layer input to match.
    del config['model']['text_field_embedder']['token_embedders']['token_characters']
    config['model']['phrase_layer']['input_size'] = 2
    self.model = Model.from_params(vocab=word_vocab, params=config['model'])

    self.ensure_batch_predictions_are_consistent()

    # Put back the state that was saved at the top.
    self.model, self.instances = original_model, original_instances
def test_from_params(self):
    """Build the model from every ``*.jsonnet`` config and check its output on sample data.

    For each config in ``CONFIG_DIR`` the reader and model are constructed
    from the config, the sample file is read and batched, and the model's
    forward output is checked for its keys, the rank of ``logits`` and the
    type of the first ``seq_id``.

    Raises:
        AssertionError: If a model cannot be built from a config (chained to
            the underlying exception) or if the output checks fail.
    """
    for config_path in CONFIG_DIR.glob("*.jsonnet"):
        params = Params.from_file(
            str(config_path),
            ext_vars={"TRAIN_DATA_PATH": "", "VALID_DATA_PATH": ""},
        )

        # The reader class is named explicitly below, so "type" is removed first.
        data_reader_params = params["dataset_reader"]
        data_reader_params.pop("type")
        reader = openvaccine.CovidReader.from_params(data_reader_params)

        instances = reader.read(PROJECT_ROOT / "data" / "sample.jsonl")
        vocab = Vocabulary.from_instances(instances)
        batch = Batch(instances)
        batch.index_instances(vocab)

        try:
            model = Model.from_params(params=params["model"], vocab=vocab)
        except Exception as e:
            # Chain explicitly so the original traceback is reported as the cause.
            raise AssertionError(
                f"unable to load params from {config_path}, because {e}") from e

        output_dict = model(**batch.as_tensor_dict())

        assert set(output_dict.keys()) == {
            "logits",
            "seq_id",
            "loss",
        }
        assert len(output_dict["logits"].shape) == 3
        assert isinstance(output_dict["seq_id"][0], str)
def test_batch_predictions_are_consistent(self):
    """Run the batch-consistency check with the CNN components removed.

    Same issue as the BiDAF test case: the CNN encoder is not properly masked
    yet, so changing the amount of padding in a batch produces small output
    differences. The character embedder is removed and the number of
    convolutions per block is set to zero for this test, after which the
    previously set-up model and instances are restored.
    """
    original_model, original_instances = self.model, self.instances

    # Re-read the data with only the word-level indexer.
    config = Params.from_file(self.param_file)
    squad_reader = DatasetReader.from_params(config["dataset_reader"])
    squad_reader._token_indexers = {"tokens": squad_reader._token_indexers["tokens"]}
    self.instances = squad_reader.read(FIXTURES_ROOT / "data" / "squad.json")
    word_vocab = Vocabulary.from_instances(self.instances)
    for item in self.instances:
        item.index_fields(word_vocab)

    # Strip the character embedder and disable the convolutions in both layers.
    del config["model"]["text_field_embedder"]["token_embedders"]["token_characters"]
    for layer_name in ("phrase_layer", "modeling_layer"):
        config["model"][layer_name]["num_convs_per_block"] = 0
    self.model = Model.from_params(vocab=word_vocab, params=config["model"])

    self.ensure_batch_predictions_are_consistent()

    # Put back the state that was saved at the top.
    self.model, self.instances = original_model, original_instances
def test_create_models_from_allennlp_configs(self, config_path):
    """Build the model described by ``config_path`` and check the shape of its ``probs``.

    Args:
        config_path: Path of the config file to load.
    """
    external_vars = {
        "CLF_TRAIN_DATA_PATH": "",
        "CLF_VALID_DATA_PATH": "",
        "DISCRETIZER_PATH": str(DISCRETIZER_PATH),
        "VOCAB_PATH": str(VOCAB_PATH),
    }
    config = Params.from_file(str(config_path), ext_vars=external_vars)

    instances = DatasetReader.from_params(config["dataset_reader"]).read(DATA_PATH)
    vocab = Vocabulary.from_instances(instances)
    expected_labels = vocab.get_vocab_size(namespace="labels")

    batch = Batch(instances)
    batch.index_instances(vocab)

    try:
        model = Model.from_params(params=config["model"], vocab=vocab)
    except Exception as e:
        raise AssertionError(f"unable to load params from {config_path}") from e

    output_dict = model(**batch.as_tensor_dict())

    # One row per instance, one column per label.
    assert "probs" in output_dict
    probs_shape = output_dict["probs"].shape
    assert len(probs_shape) == 2
    assert probs_shape[0] == len(instances)
    assert probs_shape[1] == expected_labels
def setUp(self):
    """Read the tagging fixture eagerly and lazily, then build vocab, model and loaders."""
    super().setUp()

    fixture_path = self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    self.instances = SequenceTaggingDatasetReader().read(fixture_path)
    self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )

    self.vocab = Vocabulary.from_instances(self.instances)
    vocab = self.vocab

    embedder_config = {
        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
    }
    encoder_config = {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}
    self.model_params = Params(
        {"text_field_embedder": embedder_config, "encoder": encoder_config}
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)

    # The training and validation loaders both wrap the eager instances.
    self.data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)
    self.data_loader_lazy = DataLoader(
        self.instances_lazy, batch_size=2, collate_fn=allennlp_collate
    )
    self.validation_data_loader = DataLoader(
        self.instances, batch_size=2, collate_fn=allennlp_collate
    )

    self.instances.index_with(vocab)
    self.instances_lazy.index_with(vocab)
def setUp(self):
    """Read the toy train/dev data and build a ``SimpleSeq2Seq`` model on CUDA device 0."""
    self.reader = ToyReader()
    self.train_instances = self.reader.read(
        "/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
    self.dev_instances = self.reader.read(
        "/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
    self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

    # Source-side embedding, sized from the 'tokens' namespace plus two.
    source_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
        embedding_dim=256,
        padding_index=0)
    source_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": source_embedding})

    lstm = nn.LSTM(input_size=source_embedder.get_output_dim(),
                   num_layers=2,
                   hidden_size=256,
                   bidirectional=True,
                   dropout=0.4,
                   batch_first=True)
    seq_encoder = PytorchSeq2SeqWrapper(lstm)

    self.model = SimpleSeq2Seq(vocab=self.vocab,
                               source_embedder=source_embedder,
                               encoder=seq_encoder,
                               target_embedding_dim=256,
                               target_namespace='target_tokens',
                               attention=DotProductAttention(),
                               max_decoding_steps=25,
                               beam_size=5,
                               use_bleu=True)
    self.model.cuda(0)
def setUp(self):
    """Attach a ``metric_tracker`` property to ``CallbackTrainer`` and build the test fixtures."""
    super().setUp()

    # Many tests need the metric tracker, so it is exposed as a property that
    # fetches it from the first ``TrackMetrics`` callback (``None`` if absent).
    def metric_tracker(self: CallbackTrainer):
        return next(
            (callback.metric_tracker
             for callback in self.handler.callbacks()
             if isinstance(callback, TrackMetrics)),
            None,
        )

    CallbackTrainer.metric_tracker = property(metric_tracker)

    fixture_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
    self.instances = SequenceTaggingDatasetReader().read(fixture_path)
    self.vocab = Vocabulary.from_instances(self.instances)

    embedder_config = {
        "token_embedders": {
            "tokens": {"type": "embedding", "embedding_dim": 5}
        }
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
def get_trainer_from_config(
        config: Params,
        train_instances: List[Instance],
        val_instances: List[Instance],
        device: int,
        serialization_dir: Optional[str] = None) -> Trainer:
    """Create a ``Trainer`` from the ``trainer``, ``model`` and ``iterator`` config sections.

    The vocabulary is loaded from ``vocab_dir`` when the config provides one,
    otherwise it is built from the training instances.

    Args:
        config: Configuration; the consumed sections are popped from it.
        train_instances: Training instances.
        val_instances: Validation instances.
        device: Value written to the trainer's ``cuda_device`` setting.
        serialization_dir: Optional directory passed on to the trainer.

    Returns:
        The constructed trainer.
    """
    trainer_config = config.pop("trainer")
    trainer_config["cuda_device"] = device
    model_config = config.pop("model")

    vocab_dir = config.pop("vocab_dir", None)
    if vocab_dir is not None:
        vocabulary = Vocabulary.from_files(vocab_dir)
    else:
        vocabulary = Vocabulary.from_instances(train_instances)

    built_model = Model.from_params(model_config, vocab=vocabulary)

    data_iterator = DataIterator.from_params(config.pop("iterator"))
    # Only one serialized model is kept.
    trainer_config["num_serialized_models_to_keep"] = 1
    data_iterator.index_with(vocabulary)

    return Trainer.from_params(model=built_model,
                               iterator=data_iterator,
                               train_data=train_instances,
                               validation_data=val_instances,
                               serialization_dir=serialization_dir,
                               params=trainer_config)
def setUp(self):
    """Read the tagging fixture and build the vocab, model, optimizer and iterator."""
    super(TestTrainer, self).setUp()

    fixture_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
    self.instances = SequenceTaggingDatasetReader().read(fixture_path)
    self.vocab = Vocabulary.from_instances(self.instances)
    vocab = self.vocab

    embedder_config = {
        "tokens": {"type": "embedding", "embedding_dim": 5}
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)

    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)
def setUp(self):
    """Read and index the tagging fixture, then build the model, optimizer and iterator."""
    super(TestTrainer, self).setUp()

    tagging_data = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    self.vocab = Vocabulary.from_instances(tagging_data)
    # The dataset is indexed with the vocabulary before being stored.
    tagging_data.index_instances(self.vocab)
    self.dataset = tagging_data

    embedder_config = {
        "tokens": {"type": "embedding", "embedding_dim": 5}
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "stacked_encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(self.vocab, self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
def _yield_one_epoch(self, instances: Iterable[Instance], shuffle: bool,
                     cuda_device: int, for_training: bool):
    """Yield one dict per batch holding raw tensors, extended tensors and the instances.

    Each batch is tensorized twice: first with the current vocabulary
    (``'raw'``), then again after ``self.vocab`` has been extended with a
    vocabulary built from that batch (``'extended'``). The batch's instances
    are included under ``'instances'`` for metric computation.

    Args:
        instances: Instances to batch.
        shuffle: Forwarded to ``self._create_batches``.
        cuda_device: Forwarded to ``as_tensor_dict``.
        for_training: Forwarded to ``as_tensor_dict``.
    """
    def _index_and_tensorize(batch):
        # Index with the current state of self.vocab, log, and convert to tensors.
        batch.index_instances(self.vocab)
        lengths = batch.get_padding_lengths()
        logger.debug("Batch padding lengths: %s", str(lengths))
        logger.debug("Batch size: %d", len(batch.instances))
        return batch.as_tensor_dict(lengths,
                                    cuda_device=cuda_device,
                                    for_training=for_training)

    for batch in self._create_batches(instances, shuffle):
        # raw: tensors under the vocabulary as it stands before this batch.
        raw_tensors = _index_and_tensorize(batch)

        # extended: grow the shared vocabulary with this batch, then re-index.
        self.vocab.extend_from(Vocabulary.from_instances(batch.instances))
        extended_tensors = _index_and_tensorize(batch)

        yield {
            'raw': raw_tensors,
            'extended': extended_tensors,
            # instance for metrics
            'instances': batch.instances,
        }
def test_from_params(self, data_path: str, sentence_marker_params: Params,
                     ccm_params: Params) -> None:
    """Check the sentence penalty map of a module built from params.

    Args:
        data_path: File read with the dataset reader to build the vocabulary.
        sentence_marker_params: Params used to construct the dataset reader.
        ccm_params: Params used to construct the constrained conditional module.
    """
    vocab = Vocabulary.from_instances(
        DatasetReader.from_params(sentence_marker_params).read(data_path))
    module = ConstrainedConditionalModule.from_params(vocab=vocab, params=ccm_params)

    # The penalty is expected on the index of the "I-type" label.
    expected_index = vocab.get_token_index("I-type", "labels")
    assert module._sentence_penalty_map == (expected_index, 50.)
def set_up_model(self, param_file, dataset_file):
    """Load the config, read and index the dataset, and build the model under test.

    Args:
        param_file: Path of the experiment configuration file.
        dataset_file: Path of the dataset; converted to ``str`` before reading.
    """
    self.param_file = param_file
    config = Params.from_file(self.param_file)

    loaded = DatasetReader.from_params(config["dataset_reader"]).read(str(dataset_file))

    # A "vocabulary" section in the config takes precedence, so that choices
    # like "non_padded_namespaces" or "min_count" can be set if needed.
    if "vocabulary" not in config:
        built_vocab = Vocabulary.from_instances(loaded)
    else:
        built_vocab = Vocabulary.from_params(params=config["vocabulary"], instances=loaded)

    self.vocab = built_vocab
    self.instances = loaded
    self.instances.index_with(built_vocab)
    self.model = Model.from_params(vocab=self.vocab, params=config["model"])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
def read_squad_allennlp(file_path):
    '''Read data, build the vocab, batch, pad and index it, printing diagnostics.

    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars"),
    }
    squad_instances = SquadReader(token_indexers=indexers).read(file_path)

    # Show the question field of the first instance only (if there is one).
    for first_instance in squad_instances:
        first_question = first_instance.fields['question']
        print(first_question)
        print(type(first_question))
        break

    vocabulary = Vocabulary.from_instances(squad_instances)
    # Both mappings come from ``get_index_to_token_vocabulary``.
    words_by_index = vocabulary.get_index_to_token_vocabulary("token_ids")
    chars_by_index = vocabulary.get_index_to_token_vocabulary("token_chars")
    print(len(words_by_index))
    print(len(chars_by_index))
    print(chars_by_index)

    squad_batch = Batch(squad_instances)
    squad_batch.index_instances(vocabulary)
    lengths = squad_batch.get_padding_lengths()
    print(lengths)

    tensors = squad_batch.as_tensor_dict(lengths)
    for text_field in ('passage', 'question'):
        for indexer_name in ('tokens', 'chars'):
            print(tensors[text_field][indexer_name].shape)
    for span_key in ('span_start', 'span_end'):
        print(tensors[span_key].shape)
def test_batch_predictions_are_consistent(self):
    """Run the batch-consistency check with the CNN components removed.

    Same issue as the BiDAF test case: the CNN encoder is not properly masked
    yet, so changing the amount of padding in a batch produces small output
    differences. The character embedder is removed and the number of
    convolutions per block is set to zero for this test, after which the
    previously set-up model and instances are restored.
    """
    # pylint: disable=protected-access,attribute-defined-outside-init
    original_model, original_instances = self.model, self.instances

    # Re-read the data with only the word-level indexer.
    config = Params.from_file(self.param_file)
    squad_reader = DatasetReader.from_params(config['dataset_reader'])
    squad_reader._token_indexers = {'tokens': squad_reader._token_indexers['tokens']}
    self.instances = squad_reader.read(self.FIXTURES_ROOT / 'data' / 'squad.json')
    word_vocab = Vocabulary.from_instances(self.instances)
    for item in self.instances:
        item.index_fields(word_vocab)

    # Strip the character embedder and disable the convolutions in both layers.
    del config['model']['text_field_embedder']['token_embedders']['token_characters']
    for layer_name in ('phrase_layer', 'modeling_layer'):
        config['model'][layer_name]['num_convs_per_block'] = 0
    self.model = Model.from_params(vocab=word_vocab, params=config['model'])

    self.ensure_batch_predictions_are_consistent()

    # Put back the state that was saved at the top.
    self.model, self.instances = original_model, original_instances
def set_up_model(self, param_file, dataset_file):
    """Load the config, read the dataset into a list, and build the model under test.

    Args:
        param_file: Path of the experiment configuration file.
        dataset_file: Path of the dataset; converted to ``str`` before reading.
    """
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    config = Params.from_file(self.param_file)

    # The dataset reader might be lazy, but a lazy list here breaks some of
    # our tests, so the instances are materialised.
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    loaded = list(dataset_reader.read(str(dataset_file)))

    # A 'vocabulary' section in the config takes precedence, so that choices
    # like "non_padded_namespaces" or "min_count" can be set if needed.
    if 'vocabulary' not in config:
        built_vocab = Vocabulary.from_instances(loaded)
    else:
        built_vocab = Vocabulary.from_params(params=config['vocabulary'], instances=loaded)

    self.vocab = built_vocab
    self.instances = loaded
    self.model = Model.from_params(vocab=self.vocab, params=config['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def build_vocab(self, pipeline: "Pipeline", lazy: bool = False) -> Vocabulary:
    """Build the configured vocabulary

    Instances from every dataset in ``self.datasets`` are streamed into
    ``Vocabulary.from_instances`` together with the configured options.

    Parameters
    ----------
    pipeline
        The pipeline used to create the instances from which the vocabulary is built.
    lazy
        Forwarded to each dataset's ``to_instances``; per the original
        documentation, true means instances are lazily loaded from disk and
        false means they are loaded into memory.

    Returns
    -------
    vocab
    """
    def _iter_all_instances():
        # Chain the instances of all datasets without materialising them.
        for dataset in self.datasets:
            yield from dataset.to_instances(pipeline, lazy=lazy)

    return Vocabulary.from_instances(
        instances=_iter_all_instances(),
        max_vocab_size=self.max_vocab_size,
        min_count=self.min_count,
        pretrained_files=self.pretrained_files,
        only_include_pretrained_words=self.only_include_pretrained_words,
        min_pretrained_embeddings=self.min_pretrained_embeddings,
        tokens_to_add=self.tokens_to_add,
    )
def set_up_model(self, param_file, dataset_file):
    """Load the config, read the dataset, and build the vocabulary and model under test.

    Args:
        param_file: Path of the experiment configuration file.
        dataset_file: Path of the dataset handed to the reader.
    """
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    config = Params.from_file(self.param_file)

    loaded = DatasetReader.from_params(config['dataset_reader']).read(dataset_file)

    self.vocab = Vocabulary.from_instances(loaded)
    self.instances = loaded
    self.model = Model.from_params(self.vocab, config['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    """Index tokenized sentences with both an ELMo character indexer and a single-id indexer.

    Args:
        batch: Sentences, each given as a list of token strings.

    Returns:
        A tuple of the vocabulary built from the sentences and the ``"elmo"``
        entry of the batch's tensor dict.
    """
    # The same two indexer objects are shared by every text field.
    token_indexers = {
        'character_ids': ELMoTokenCharactersIndexer(),
        'tokens': SingleIdTokenIndexer(),
    }
    instances = [
        Instance({"elmo": TextField([Token(word) for word in sentence], token_indexers)})
        for sentence in batch
    ]

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
def setUp(self):
    """Read the tagging fixture and build the tagger model used by the optimizer tests."""
    super(TestOptimizer, self).setUp()

    fixture_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
    self.instances = SequenceTaggingDatasetReader().read(fixture_path)
    tagging_vocab = Vocabulary.from_instances(self.instances)

    embedder_config = {
        "tokens": {"type": "embedding", "embedding_dim": 5}
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(vocab=tagging_vocab, params=self.model_params)
def set_up_model(self, param_file, dataset_file):
    """Load the config, read the dataset, and build the vocabulary and model under test.

    Args:
        param_file: Path of the experiment configuration file.
        dataset_file: Path of the dataset handed to the reader.
    """
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    config = Params.from_file(self.param_file)

    loaded = DatasetReader.from_params(config['dataset_reader']).read(dataset_file)

    # A 'vocabulary' section in the config takes precedence, so that choices
    # like "non_padded_namespaces" or "min_count" can be set if needed.
    if 'vocabulary' not in config:
        built_vocab = Vocabulary.from_instances(loaded)
    else:
        built_vocab = Vocabulary.from_params(params=config['vocabulary'], instances=loaded)

    self.vocab = built_vocab
    self.instances = loaded
    self.model = Model.from_params(vocab=self.vocab, params=config['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def setUp(self):
    """Read the tagging fixture and build the vocab, model, optimizer and iterator."""
    super(TestTrainer, self).setUp()

    fixture_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
    self.instances = SequenceTaggingDatasetReader().read(fixture_path)
    self.vocab = Vocabulary.from_instances(self.instances)
    vocab = self.vocab

    embedder_config = {
        "tokens": {"type": "embedding", "embedding_dim": 5}
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)

    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)