def test_elmo_empty_token_list(self):
    """Batch two instances with differently sized target lists and check padding.

    The first instance has one target and the second has two, so the second
    target slot of the first instance only exists after batching. The test
    asserts that slot is all zeros and that every real target is not.
    """
    token_indexers = {'elmo': CustomELMoTokenCharactersIndexer()}

    def make_text_field(*words):
        # Every field in this test shares the same indexer mapping.
        return TextField([Token(word) for word in words], token_indexers)

    first_instance = Instance({
        'tokens': make_text_field('Apple'),
        'targets': ListField([make_text_field('Apple')]),
    })
    second_instance = Instance({
        'tokens': make_text_field('Screen', 'device'),
        'targets': ListField([
            make_text_field('Screen'),
            make_text_field('Device'),
        ]),
    })

    batch = Batch([first_instance, second_instance])
    batch.index_instances(Vocabulary())
    target_indices = batch.as_tensor_dict()['targets']['elmo']['tokens']

    all_zeros = np.zeros((1, 50))

    # Slot [0][1] is the padded target of the first instance.
    padded_target = target_indices[0][1].numpy()
    np.testing.assert_array_equal(all_zeros, padded_target)

    # Every real target must differ from the all-zero array.
    real_targets = [
        target_indices[0][0],
        target_indices[1][0],
        target_indices[1][1],
    ]
    for real_target in real_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), real_target)
def forward_on_instances(self, instances: List[Instance],
                         **kwargs) -> List[Dict[str, np.ndarray]]:
    """Run the model on `instances` and split the output per instance.

    An exact copy of the original method, but supports kwargs: any extra
    keyword arguments are forwarded to the model call alongside the batched
    tensors.

    # Parameters

    instances : `List[Instance]`
        The instances to batch, index with `self.vocab` and run.
    **kwargs
        Extra keyword arguments passed through to `self(...)`.

    # Returns

    One dict per instance. Outputs whose leading size does not equal the
    number of instances are skipped (after calling
    `_maybe_warn_for_unseparable_batches`).
    """
    expected_size = len(instances)
    with torch.no_grad():
        device = self._get_prediction_device()
        batch = Batch(instances)
        batch.index_instances(self.vocab)
        tensors = util.move_to_device(batch.as_tensor_dict(), device)
        outputs = self.make_output_human_readable(self(**tensors, **kwargs))

        per_instance: List[Dict[str, np.ndarray]] = [
            {} for _ in batch.instances
        ]
        for name, value in list(outputs.items()):
            if isinstance(value, torch.Tensor):
                # 0-dim tensors cannot be iterated; give them a batch axis.
                if value.dim() == 0:
                    value = value.unsqueeze(0)
                separable = value.size(0) == expected_size
                if separable:
                    value = value.detach().cpu().numpy()
            else:
                separable = len(value) == expected_size

            if not separable:
                self._maybe_warn_for_unseparable_batches(name)
                continue

            for slot, element in zip(per_instance, value):
                slot[name] = element
        return per_instance
def allennlp_collate(instances: List[Instance]) -> TensorDict:
    """
    Default collate function: wraps a list of `Instance`s in a `Batch` and
    returns that batch converted to a `TensorDict`.
    """
    return Batch(instances).as_tensor_dict()
def set_up_model(self, param_file, dataset_file):
    """Build the reader, vocabulary, model and an indexed batch for a test.

    Sets `self.param_file`, `self.vocab`, `self.instances`, `self.model`
    and `self.dataset` from the given config file and dataset file.
    """
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params["dataset_reader"])
    # The reader output is used as-is; it must support `index_with` below.
    instances = reader.read(str(dataset_file))

    # Prefer vocabulary settings from the config when present, so options such
    # as "non_padded_namespaces" or "min_count" can be set if needed.
    if "vocabulary" in params:
        self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                            instances=instances)
    else:
        self.vocab = Vocabulary.from_instances(instances)

    self.instances = instances
    self.instances.index_with(self.vocab)
    self.model = Model.from_params(vocab=self.vocab, params=params["model"])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
def test_elmo_empty_token_list(self):
    """Check the ELMo indexer's empty token list, alone and inside a batch."""
    # Basic test
    elmo_indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == elmo_indexer.get_empty_token_list()

    # Real world test
    token_indexers = {"elmo": elmo_indexer}

    def make_text_field(*words):
        # Every field in this test shares the same indexer mapping.
        return TextField([Token(word) for word in words], token_indexers)

    first_instance = Instance({
        "tokens": make_text_field("Apple"),
        "targets": ListField([make_text_field("Apple")]),
    })
    second_instance = Instance({
        "tokens": make_text_field("Screen", "device"),
        "targets": ListField([
            make_text_field("Screen"),
            make_text_field("Device"),
        ]),
    })

    batch = Batch([first_instance, second_instance])
    batch.index_instances(Vocabulary())
    target_indices = batch.as_tensor_dict()["targets"]["elmo"]["elmo_tokens"]

    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    padded_target = target_indices[0][1].numpy()
    np.testing.assert_array_equal(np.zeros((1, 50)), padded_target)

    # Every real target must differ from the all-zero array.
    real_targets = [
        target_indices[0][0],
        target_indices[1][0],
        target_indices[1][1],
    ]
    for real_target in real_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), real_target)
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the
    sentences with encoded characters
    (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

        A tensor of padded character ids.
    """
    indexer = ELMoTokenCharactersIndexer()
    # One instance per sentence, each holding a single "elmo" text field.
    instances = [
        Instance({
            "elmo": TextField([Token(word) for word in sentence],
                              {"character_ids": indexer})
        })
        for sentence in batch
    ]

    dataset = Batch(instances)
    dataset.index_instances(Vocabulary())
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict["elmo"]["character_ids"]["tokens"]
def setUp(self) -> None:
    """Load the LSTM pointer-rewrite fixture: reader, vocab, model and batch."""
    super().setUp()
    fixture_config = (FIXTURES_ROOT / "pointer_rewrite"
                      / "lstm_lstm_pointer_rewrite.jsonnet")
    fixture_data = FIXTURES_ROOT / "test_pointer_rewrite.txt"

    self.param_file = fixture_config
    params = Params.from_file(self.param_file)

    # Build the dataset reader and read the fixture data.
    reader = DatasetReader.from_params(params["dataset_reader"])
    instances = reader.read(str(fixture_data))

    # If the config has vocabulary parameters, build the vocabulary from them.
    if "vocabulary" in params:
        self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                            instances=instances)
    else:
        self.vocab = Vocabulary.from_instances(instances)

    self.instances = instances
    self.instances.index_with(self.vocab)

    # Build the model.
    self.model = Model.from_params(params=params["model"], vocab=self.vocab)

    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)

    # Scratch directory used by tests that write files.
    self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """
    This method should return one epoch worth of batches.

    Instances are collected in per-type "hoppers". When
    `self.allow_mixed_batches` is true, all instances share one hopper;
    otherwise the hopper key is the metadata of the field named
    `self.type_field_name`. A hopper is emitted as a `Batch` once it holds
    `self._batch_size` instances, and any non-empty leftovers are emitted at
    the end. The `shuffle` argument is not used by this implementation.
    """
    hoppers: Dict[Any, List[Instance]] = defaultdict(list)

    for instance in instances:
        # Which hopper do we put this instance in?
        if self.allow_mixed_batches:
            instance_type = ""
        else:
            instance_type = instance.fields[
                self.type_field_name].metadata  # type: ignore

        hopper = hoppers[instance_type]
        hopper.append(instance)

        # If the hopper is full, yield up the batch and start a new hopper.
        if len(hopper) >= self._batch_size:
            yield Batch(hopper)
            # Rebind rather than `.clear()`: the Batch may hold a reference to
            # this list, and clearing it in place would empty a batch the
            # consumer is still holding.
            hoppers[instance_type] = []

    # Deal with leftovers
    for remaining in hoppers.values():
        if remaining:
            yield Batch(remaining)
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
    """Run one forward pass and check the size of the class probabilities."""
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)

    output_dict = self.model(**batch.as_tensor_dict())
    class_probabilities = output_dict["class_probabilities"]

    # The last dimension must match the size of the "labels" namespace.
    label_count = self.model.vocab.get_vocab_size("labels")
    assert class_probabilities.size() == (2, 7, label_count)
def convert_documents_to_batch(self,
                               documents: List[Tuple[List[Token],
                                                     List[Token]]],
                               vocabulary) -> Dict[str, Any]:
    """Turn `documents` into an indexed batch and return its "document" entry.

    Each element of `documents` is converted with
    `self.convert_tokens_to_instance`, and the resulting batch is indexed with
    `vocabulary` before being converted to tensors.
    """
    instances = []
    for document in documents:
        instances.append(self.convert_tokens_to_instance(document))

    indexed_batch = Batch(instances)
    indexed_batch.index_instances(vocabulary)

    tensor_dict = indexed_batch.as_tensor_dict()
    return tensor_dict["document"]
def test_padding_lengths_uses_max_instance_lengths(self):
    """Check the padding lengths reported by an indexed batch."""
    expected_lengths = {
        "text1": {"tokens___tokens": 5},
        "text2": {"tokens___tokens": 6},
    }

    batch = Batch(self.instances)
    batch.index_instances(self.vocab)

    assert batch.get_padding_lengths() == expected_lengths
def get_gradients(
        self, instances: List[Instance]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Gets the gradients of the loss with respect to the model inputs.

    # Parameters

    instances: List[Instance]

    # Returns

    Tuple[Dict[str, Any], Dict[str, Any]]
    The first item is a Dict of gradient entries for each input.
    The keys have the form  `{grad_input_1: ..., grad_input_2: ... }`
    up to the number of inputs given. The second item is the model's output.

    Notes
    -----
    Takes a list of [`Instances`](../data/instance.md), batches and indexes
    them, and sends them through the model
    [`forward`](../models/model.md#forward) function after registering hooks
    on the embedding layer of the model. Calls `backward` on the loss and then
    removes the hooks. The hooks are removed even when the forward or backward
    pass raises, so a failed call does not leave them attached to the model.
    """
    embedding_gradients: List[Tensor] = []
    hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(
        embedding_gradients)

    try:
        dataset = Batch(instances)
        dataset.index_instances(self._model.vocab)
        dataset_tensor_dict = util.move_to_device(dataset.as_tensor_dict(),
                                                  self.cuda_device)
        # To bypass "RuntimeError: cudnn RNN backward can only be called in training mode"
        with backends.cudnn.flags(enabled=False):
            outputs = self._model.make_output_human_readable(
                self._model.forward(**dataset_tensor_dict)  # type: ignore
            )

            loss = outputs["loss"]
            self._model.zero_grad()
            loss.backward()
    finally:
        # Always detach the hooks, otherwise an exception above would leave
        # them registered and they would fire on later passes.
        for hook in hooks:
            hook.remove()

    # Keys are 1-based: grad_input_1, grad_input_2, ...
    grad_dict = {}
    for position, grad in enumerate(embedding_gradients, start=1):
        grad_dict["grad_input_" + str(position)] = grad.detach().cpu().numpy()

    return grad_dict, outputs
def compose_batch_stream(ins_stream: Generator[Instance, None, None],
                         batch_size: int = 12) -> Generator[Batch, None, None]:
    """Group a stream of instances into `Batch` objects of `batch_size`.

    A batch is emitted each time exactly `batch_size` instances have been
    buffered. Whatever is left when the stream ends is emitted as a final,
    smaller batch.

    # Parameters

    ins_stream :
        Source of instances; any iterable works.
    batch_size : `int`, optional (default = 12)
        Number of instances per full batch.

    # Returns

    A generator of `Batch` objects.
    """
    buffer = []
    for instance in ins_stream:
        buffer.append(instance)
        if len(buffer) == batch_size:
            yield Batch(buffer)
            # Start a fresh list instead of clearing in place: the Batch may
            # keep a reference to the list it was given, and clearing it would
            # empty a batch the consumer still holds.
            buffer = []

    # Emit the trailing partial batch, if any.
    if buffer:
        yield Batch(buffer)
def forward_on_instances(
        self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Runs the model on a list of :class:`~allennlp.data.instance.Instance`s.

    The instances are batched and indexed with this model's
    :class:`Vocabulary`, moved to the prediction device, and passed through
    :func:`self.forward()` and :func:`self.decode()` (which by default does
    nothing). Any `torch.Tensors` in the result are converted into numpy
    arrays, and the batched output is separated into a list of individual
    dicts per instance. Note that typically this will be faster on a GPU (and
    conditionally, on a CPU) than repeated calls to
    :func:`forward_on_instance`.

    # Parameters

    instances : List[Instance], required
        The instances to run the model on.

    # Returns

    A list of the models output for each instance.
    """
    expected_size = len(instances)
    with torch.no_grad():
        device = self._get_prediction_device()
        batch = Batch(instances)
        batch.index_instances(self.vocab)
        tensors = util.move_to_device(batch.as_tensor_dict(), device)
        outputs = self.decode(self(**tensors))

        per_instance: List[Dict[str, numpy.ndarray]] = [
            {} for _ in batch.instances
        ]
        for name, value in list(outputs.items()):
            if isinstance(value, torch.Tensor):
                # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                # This occurs with batch size 1, because we still want to include the loss in that case.
                if value.dim() == 0:
                    value = value.unsqueeze(0)
                separable = value.size(0) == expected_size
                if separable:
                    value = value.detach().cpu().numpy()
            else:
                separable = len(value) == expected_size

            # Outputs that do not line up with the batch cannot be split.
            if not separable:
                self._maybe_warn_for_unseparable_batches(name)
                continue

            for slot, element in zip(per_instance, value):
                slot[name] = element
        return per_instance
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    """Index sentences with both an ELMo character indexer and a single-id indexer.

    Returns a pair: the vocabulary built from the instances, and the "elmo"
    entry of the batch's tensor dict (which holds both the "character_ids"
    and "tokens" indexer outputs).
    """
    token_indexers = {
        "character_ids": ELMoTokenCharactersIndexer(),
        "tokens": SingleIdTokenIndexer(),
    }
    # One instance per sentence, each holding a single "elmo" text field.
    instances = [
        Instance({
            "elmo": TextField([Token(word) for word in sentence],
                              {"character_ids": token_indexers["character_ids"],
                               "tokens": token_indexers["tokens"]})
        })
        for sentence in batch
    ]

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    tensor_dict = dataset.as_tensor_dict()
    return vocab, tensor_dict["elmo"]
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """Yield batches, optionally shuffling at the document level first.

    The (possibly shuffled) instances are split into memory-sized lists. Each
    list is cut into groups of `self._batch_size`, and every group is passed
    through `_ensure_batch_is_sufficiently_small`, which may split it further
    and carries overflow in `excess`.
    """
    # Shuffle the documents if requested.
    if shuffle:
        source = self._shuffle_documents(instances)
    else:
        source = instances

    for memory_chunk in self._memory_sized_lists(source):
        carried_over: Deque[Instance] = deque()

        # Then break each memory-sized list into batches.
        groups = lazy_groups_of(iter(memory_chunk), self._batch_size)
        for group in groups:
            pieces = self._ensure_batch_is_sufficiently_small(group,
                                                              carried_over)
            for piece in pieces:
                yield Batch(piece)

        # Whatever is still carried over forms the last batch of this chunk.
        if carried_over:
            yield Batch(carried_over)
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
    """Run one forward pass and check the predicted tag sequences."""
    allowed_tags = {"O", "I-ORG", "I-PER", "I-LOC"}

    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    output_dict = self.model(**batch.as_tensor_dict())

    predicted = output_dict["tags"]
    assert len(predicted) == 2
    assert len(predicted[0]) == 7
    assert len(predicted[1]) == 7

    # Every predicted id must map back to one of the known labels.
    label_vocab = self.model.vocab
    for sequence in predicted:
        for tag_id in sequence:
            tag = label_vocab.get_token_from_index(tag_id, namespace="labels")
            assert tag in allowed_tags
def setUp(self) -> None:
    """Load the BERT pointer-rewrite fixture: reader, vocab, model and batch.

    A temporary BERT-style `vocab.txt` is derived from the reader's
    `tokens.txt`, and the config is reloaded with overrides pointing at it.
    """
    super().setUp()
    fixture_config = (FIXTURES_ROOT / "pointer_rewrite"
                      / "bert_transformer_pointer_rewrite.jsonnet")
    fixture_data = FIXTURES_ROOT / "test_pointer_rewrite.txt"

    self.param_file = fixture_config
    params = Params.from_file(self.param_file)

    # Build a vocabulary file usable by the BERT model that stays consistent
    # with the `Vocabulary` token file.
    vocab_path = params["dataset_reader"]["vocab_path"]
    # Directory that receives the newly generated BERT vocabulary.
    bert_temp_dir = tempfile.mkdtemp(suffix="bert")
    source_tokens = Path(vocab_path) / "tokens.txt"
    bert_vocab = Path(bert_temp_dir) / "vocab.txt"
    with open(source_tokens, 'r', encoding="utf-8") as tokens_in, \
            open(bert_vocab, 'w', encoding="utf-8") as vocab_out:
        # "[PAD]" goes first, followed by every token, one per line.
        vocab_out.write("[PAD]\n")
        for raw_line in tokens_in:
            vocab_out.write(raw_line.strip() + "\n")

    # Override part of the config: point the reader at the generated vocab
    # directory and the model at its config file.
    overrides_config = {
        "dataset_reader.model_name": bert_temp_dir,
        "model.model_name": params["model"]["model_name"] + "/config.json"
    }
    self.overrides_config = json.dumps(overrides_config)
    params = Params.from_file(self.param_file,
                              params_overrides=self.overrides_config)

    # Build the dataset reader and read the fixture data.
    reader = DatasetReader.from_params(params["dataset_reader"])
    instances = reader.read(str(fixture_data))

    # If the config has vocabulary parameters, build the vocabulary from them.
    if "vocabulary" in params:
        self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                            instances=instances)
    else:
        self.vocab = Vocabulary.from_instances(instances)

    self.instances = instances
    self.instances.index_with(self.vocab)

    # Build the model; its model_name was overridden above to the config file.
    self.model = Model.from_params(params=params["model"], vocab=self.vocab)

    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)

    # Scratch directory used by tests that write files.
    self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
def _sentences_to_ids(self, sentences):
    """Convert tokenized sentences to the batched ELMo character-id tensor.

    Returns the `["elmo"]["character_ids"]["elmo_tokens"]` entry of the
    batch's tensor dict.
    """
    indexer = ELMoTokenCharactersIndexer()

    # One instance per sentence, each holding a single "elmo" text field.
    instances = [
        Instance({
            "elmo": TextField([Token(word) for word in sentence],
                              {"character_ids": indexer})
        })
        for sentence in sentences
    ]

    dataset = Batch(instances)
    dataset.index_instances(Vocabulary())
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict["elmo"]["character_ids"]["elmo_tokens"]
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """Yield homogeneous batches, alternating between partitions.

    Within each memory-sized list, instances are grouped by the metadata of
    the field named `self._partition_key`. Batches are then drawn from the
    groups in round-robin order until every group is exhausted. When
    `self._skip_smaller_batches` is set, groups shorter than
    `self._batch_size` are not emitted.
    """
    # First break the dataset into memory-sized lists:
    for memory_chunk in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(memory_chunk)

        # Divvy up the instances based on their value of the "partition_key" field.
        by_partition: Dict[str, List[Instance]] = defaultdict(list)
        for instance in memory_chunk:
            key = instance.fields[
                self._partition_key].metadata  # type: ignore
            by_partition[key].append(instance)

        # Get a `lazy_groups_of` iterator over each set of homogeneous instances.
        group_iterators = {}
        for key, members in by_partition.items():
            group_iterators[key] = lazy_groups_of(iter(members),
                                                  self._batch_size)

        pending = set(group_iterators)

        # Yield batches in a round-robin fashion until none are left.
        while pending:
            for key, groups in group_iterators.items():
                if key not in pending:
                    continue
                try:
                    group = next(groups)
                    too_small = len(group) != self._batch_size
                    if self._skip_smaller_batches and too_small:
                        continue
                    yield Batch(group)
                except StopIteration:
                    pending.remove(key)
def instances_to_captum_inputs(self, labeled_instances):
    """Convert labeled instances into the input tuple used for Captum.

    The instances are batched, indexed with `self.vocab` and moved to the
    prediction device. The token ids are then embedded with
    `self.embeddings`.

    # Returns

    A 3-tuple:

    - a 1-tuple holding the embedded tokens,
    - `None`,
    - a tuple `(attention_mask, label, output_dict)` where `output_dict`
      maps `"embedding"` to the embedded tokens.
    """
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        batch = Batch(labeled_instances)
        batch.index_instances(self.vocab)
        model_input = util.move_to_device(batch.as_tensor_dict(), cuda_device)

        token_tensors = model_input["tokens"]["tokens"]
        input_ids = token_tensors["token_ids"]
        label = model_input["label"]
        attention_mask = token_tensors["mask"]

        embedded_tokens = self.embeddings(input_ids)
        output_dict = {"embedding": embedded_tokens}
        return (embedded_tokens, ), None, (attention_mask, label, output_dict)
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """Yield batches from memory-sized lists of instances.

    Each memory-sized list is optionally shuffled in place and cut into
    groups of `self._batch_size`. Every group is passed through
    `_ensure_batch_is_sufficiently_small`, which may split it further and
    carries overflow in `excess`.
    """
    # First break the dataset into memory-sized lists:
    for memory_chunk in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(memory_chunk)

        carried_over: Deque[Instance] = deque()

        # Then break each memory-sized list into batches.
        groups = lazy_groups_of(iter(memory_chunk), self._batch_size)
        for group in groups:
            pieces = self._ensure_batch_is_sufficiently_small(group,
                                                              carried_over)
            for piece in pieces:
                yield Batch(piece)

        # Whatever is still carried over forms the last batch of this chunk.
        if carried_over:
            yield Batch(carried_over)
def forward_on_instances(
        self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
    """
    Run the model on `instances` and return the decoded output directly.

    The complicated per-instance separation checks are deliberately omitted,
    because they could leave the model with no output at the end.

    NOTE(review): the value returned is whatever `self.decode(...)` produces
    for the whole batch; it is not split per instance, so it may not match the
    annotated return type. Confirm against callers.

    :param instances: the instances to batch, index with `self.vocab` and run.
    :return: the output of `self.decode` for the batch.
    """
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(),
                                          cuda_device)
        outputs = self.decode(self(**model_input))
    return outputs
def setup_method(self):
    """Build vocabularies from the definitional-pairs and gender-specific word lists.

    Both lists are fetched through `cached_path` from the pinned debiaswe
    revision. Sets `pairs_fname`, `pairs_vocab`, `num_pairs`,
    `singles_fname`, `singles_vocab` and `num_singles` on the test instance.
    """
    token_indexer = SingleIdTokenIndexer("tokens")

    self.pairs_fname = (
        "https://raw.githubusercontent.com/tolga-b/debiaswe/"
        "4c3fa843ffff45115c43fe112d4283c91d225c09/data/definitional_pairs.json"
    )
    with open(cached_path(self.pairs_fname)) as f:
        word_pairs = json.load(f)

    # Every pair contributes lower-, title- and upper-cased forms of both words.
    pairs_list = []
    for w1, w2 in word_pairs:
        pairs_list.extend([
            w1.lower(), w2.lower(),
            w1.title(), w2.title(),
            w1.upper(), w2.upper(),
        ])

    text_field = TextField(
        [Token(t) for t in pairs_list],
        {"tokens": token_indexer},
    )
    instance = Instance({"text": text_field})
    dataset = Batch([instance])
    self.pairs_vocab = Vocabulary.from_instances(dataset)
    self.num_pairs = len(set(pairs_list))

    self.singles_fname = (
        "https://raw.githubusercontent.com/tolga-b/debiaswe/"
        "4c3fa843ffff45115c43fe112d4283c91d225c09/data/gender_specific_full.json"
    )
    with open(cached_path(self.singles_fname)) as f:
        singles_list = json.load(f)

    text_field = TextField(
        [Token(t) for t in singles_list],
        {"tokens": token_indexer},
    )
    instance = Instance({"text": text_field})
    dataset = Batch([instance])
    self.singles_vocab = Vocabulary.from_instances(dataset)
    self.num_singles = len(set(singles_list))

    super().setup_method()
def preprocess(self, token_batch):
    """Build one indexed `Batch` per indexer from a batch of token sequences.

    Each sequence is truncated to the shorter of the longest non-empty
    sequence and `self.max_len`, then prefixed with a '$START' token. Returns
    an empty list when `token_batch` holds no non-empty sequence.
    """
    lengths = [len(seq) for seq in token_batch if seq]
    if not lengths:
        return []

    limit = min(max(lengths), self.max_len)

    indexed_batches = []
    for indexer in self.indexers:
        instances = []
        for seq in token_batch:
            words = ['$START'] + seq[:limit]
            field = TextField([Token(word) for word in words], indexer)
            instances.append(Instance({'tokens': field}))

        indexed = Batch(instances)
        indexed.index_instances(self.vocab)
        indexed_batches.append(indexed)
    return indexed_batches
def test_batch_count(self):
    """Check how many batches the bucket sampler produces for the fixture data."""

    def collate_as_batch(instances):
        # We use a custom collate_fn for testing, which doesn't actually
        # create tensors, just the allennlp Batches.
        return Batch(instances)

    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    bucket_sampler = BucketBatchSampler(dataset,
                                        batch_size=2,
                                        padding_noise=0,
                                        sorting_keys=["text"])
    dataloader = DataLoader(dataset,
                            batch_sampler=bucket_sampler,
                            collate_fn=collate_as_batch)

    assert len(dataloader) == 3
class TestLSTMPointerForRewrite(TestCase):
    """Train/save/load round-trip test for the LSTM pointer-rewrite model."""

    def setUp(self) -> None:
        """Load the fixture config and build reader, vocab, model and batch."""
        super().setUp()
        fixture_config = (FIXTURES_ROOT / "pointer_rewrite"
                          / "lstm_lstm_pointer_rewrite.jsonnet")
        fixture_data = FIXTURES_ROOT / "test_pointer_rewrite.txt"

        self.param_file = fixture_config
        params = Params.from_file(self.param_file)

        # Build the dataset reader and read the fixture data.
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(fixture_data))

        # If the config has vocabulary parameters, build the vocabulary from them.
        if "vocabulary" in params:
            self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                                instances=instances)
        else:
            self.vocab = Vocabulary.from_instances(instances)

        self.instances = instances
        self.instances.index_with(self.vocab)

        # Build the model.
        self.model = Model.from_params(params=params["model"],
                                       vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)

        # Scratch directory used by tests that write files.
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))

    def test_model_can_train_save_and_load(self):
        """Train from the config, reload the archive and compare all parameters."""
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"

        # test train and save
        trained = train_model_from_file(self.param_file, save_dir)

        # test load
        restored = load_archive(archive_file, cuda_device=-1).model

        trained_keys = trained.state_dict().keys()
        restored_keys = restored.state_dict().keys()
        assert trained_keys == restored_keys

        # make sure that the state dict (the parameters) are the same
        # for both models.
        for key in trained_keys:
            trained_value = trained.state_dict()[key].cpu().numpy()
            restored_value = restored.state_dict()[key].cpu().numpy()
            assert_allclose(trained_value, restored_value, err_msg=key)
def setUp(self):
    """Create a one-instance batch over a small fixed token sequence."""
    words = ["a", "a", "a", "a", "b", "b", "c", "c", "c"]
    token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

    text_field = TextField([Token(word) for word in words], token_indexers)
    self.instance = Instance({"text": text_field})
    self.dataset = Batch([self.instance])

    super().setUp()
def __iter__(self) -> Iterator[TensorDict]:
    """Return an iterator of tensor dicts for one epoch.

    The epoch's instances are grouped by `self.scheduler.batch_instances`.
    Each group is turned into a `Batch`, converted to tensors and moved to
    `self.cuda_device` (or to -1 when that is `None`).
    """
    epoch_instances = self._get_instances_for_epoch()
    # Obtain the batch source up front; only the per-batch tensor work is lazy.
    grouped = iter(self.scheduler.batch_instances(epoch_instances))

    def _tensor_dicts():
        for instances in grouped:
            tensor_dict = Batch(instances).as_tensor_dict()
            if self.cuda_device is None:
                device = -1
            else:
                device = self.cuda_device
            yield nn_util.move_to_device(tensor_dict, device)

    return _tensor_dicts()
def setup_model(params_file, dataset_file):
    """Read a dataset, build and save its vocabulary, and print the indexed batch.

    The reader is hard-coded to `ToxicReader`; the "dataset_reader" section of
    the params file is not consulted. The vocabulary is built from the
    "vocabulary" section of the params when present, otherwise directly from
    the instances, and is saved to the "new_vocab2" directory.

    # Parameters

    params_file :
        Path of the configuration file loaded with `Params.from_file`.
    dataset_file :
        Path of the dataset passed (as a string) to the reader.
    """
    params = Params.from_file(params_file)
    reader = ToxicReader()
    instances = reader.read(str(dataset_file))

    # Build the vocabulary exactly once.
    if 'vocabulary' in params:
        vocab = Vocabulary.from_params(params=params['vocabulary'],
                                       instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    vocab.save_to_files("new_vocab2")

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    print(dataset.as_tensor_dict())