def _create_batches(self, dataset: Dataset, shuffle: bool) -> List[List[Instance]]:
    """
    Split ``dataset``'s instances into batches of ``self._batch_size``.

    Parameters
    ----------
    dataset : ``Dataset``
        The dataset whose instances will be grouped into batches.
    shuffle : ``bool``
        If ``True``, shuffle the instances before batching.

    Returns
    -------
    A list of batches, each a list of at most ``self._batch_size`` instances.
    The final batch may be smaller when the instance count is not evenly
    divisible by the batch size.
    """
    # Copy the list so shuffling does not mutate ``dataset.instances`` in
    # place — callers that reuse the dataset would otherwise see a
    # surprising reordering as a side effect.
    instances = list(dataset.instances)
    if shuffle:
        random.shuffle(instances)
    # Guard the empty case: ``group_by_count`` would return an empty list
    # and the ``[-1]`` index below would raise IndexError.
    if not instances:
        return []
    grouped_instances = group_by_count(instances, self._batch_size, None)
    # The last group might not have been full, so we drop the ``None``
    # values that group_by_count uses to pad non-complete batches.
    grouped_instances[-1] = [instance for instance in grouped_instances[-1]
                             if instance is not None]
    return grouped_instances
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Group ``instances`` into ``Batch`` objects of ``self._batch_size``.

    Parameters
    ----------
    instances : ``Iterable[Instance]``
        The instances to batch; materialized into a list so it can be
        shuffled and sized.
    shuffle : ``bool``
        If ``True``, shuffle the instances before batching.

    Returns
    -------
    A lazy iterable of ``Batch`` objects; the final batch may contain fewer
    than ``self._batch_size`` instances.
    """
    instances = ensure_list(instances)
    if shuffle:
        random.shuffle(instances)
    # Guard the empty case: ``group_by_count`` would return an empty list
    # and the ``[-1]`` index below would raise IndexError.
    if not instances:
        return iter([])
    grouped_instances = group_by_count(instances, self._batch_size, None)
    # The last group might not have been full, so we drop the ``None``
    # values that group_by_count uses to pad non-complete batches.
    grouped_instances[-1] = [
        instance for instance in grouped_instances[-1] if instance is not None
    ]
    return (Batch(batch) for batch in grouped_instances)
def test_group_by_count(self):
    """group_by_count chunks a sequence and pads the short final chunk with the default."""
    actual = util.group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 20)
    expected = [
        [1, 2, 3],
        [4, 5, 6],
        [7, 20, 20],
    ]
    assert actual == expected
def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
    """
    Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
    and returns JSON that looks like

    .. code-block:: js

        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
    """
    # For SRL, we have more instances than sentences, but the user specified
    # a batch size with respect to the number of sentences passed, so we respect
    # that here by taking the batch size which we use to be the number of sentences
    # we are given.
    batch_size = len(inputs)
    # One list of instances per input sentence — presumably one instance per
    # detected verb (sentences with no verbs yield an empty list).
    instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

    flattened_instances = [instance for sentence_instances in instances_per_sentence
                           for instance in sentence_instances]

    if not flattened_instances:
        # No verbs in any sentence: run no predictions, but still populate
        # "words" by re-tokenizing each input sentence.
        return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                         for x in inputs])

    # Make the instances into batches and check the last batch for
    # padded elements as the number of instances might not be perfectly
    # divisible by the batch size.
    batched_instances = group_by_count(flattened_instances, batch_size, None)
    batched_instances[-1] = [instance for instance in batched_instances[-1]
                             if instance is not None]

    # Run the model on the batches; ``outputs`` ends up flat, in the same
    # order as ``flattened_instances``.
    outputs = []
    for batch in batched_instances:
        outputs.extend(self._model.forward_on_instances(batch))

    verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
    return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

    # ``output_index`` walks the flat ``outputs`` list in lock-step with the
    # per-sentence verb counts, re-associating each frame with its sentence.
    output_index = 0
    for sentence_index, verb_count in enumerate(verbs_per_sentence):
        if verb_count == 0:
            # We didn't run any predictions for sentences with no verbs,
            # so we don't have a way to extract the original sentence.
            # Here we just tokenize the input again.
            original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
            return_dicts[sentence_index]["words"] = original_text
            continue

        for _ in range(verb_count):
            output = outputs[output_index]
            words = output["words"]
            tags = output['tags']
            description = self.make_srl_string(words, tags)
            # "words" is (re)assigned once per verb; all frames for a
            # sentence share the same tokenization, so this is idempotent.
            return_dicts[sentence_index]["words"] = words
            return_dicts[sentence_index]["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })
            output_index += 1
    return sanitize(return_dicts)
def test_group_by_count(self):
    # Chunk size 3 with default 20: the trailing partial chunk [7] is
    # padded out to [7, 20, 20].
    grouped = util.group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 20)
    assert grouped == [[1, 2, 3], [4, 5, 6], [7, 20, 20]]
def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
    """
    Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
    and returns JSON that looks like

    .. code-block:: js

        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
    """
    # Guard the empty batch: ``zip(*[])`` below would otherwise raise a
    # ValueError when unpacked into two names.
    if not inputs:
        return []
    # For SRL, we have more instances than sentences, but the user specified
    # a batch size with respect to the number of sentences passed, so we respect
    # that here by taking the batch size which we use to be the number of sentences
    # we are given.
    batch_size = len(inputs)
    # Each call returns a (instances, partial-result-dict) pair; unzip them
    # into parallel sequences.
    instances_per_sentence, return_dicts = zip(
            *[self._sentence_to_srl_instances(json) for json in inputs])

    flattened_instances = [
            instance
            for sentence_instances in instances_per_sentence
            for instance in sentence_instances
    ]

    if not flattened_instances:
        # No verbs anywhere: the partial result dicts are already complete.
        return sanitize(return_dicts)

    # Make the instances into batches and check the last batch for
    # padded elements as the number of instances might not be perfectly
    # divisible by the batch size.
    batched_instances = group_by_count(flattened_instances, batch_size, None)
    batched_instances[-1] = [
            instance for instance in batched_instances[-1] if instance is not None
    ]

    # Run the model on the batches; ``outputs`` ends up flat, in the same
    # order as ``flattened_instances``.
    outputs = []
    for batch in batched_instances:
        outputs.extend(self._model.forward_on_instances(batch))

    # ``output_index`` walks the flat ``outputs`` list across sentences
    # (the previous name ``sentence_index`` was misleading — it never
    # indexed sentences).
    output_index = 0
    for results in return_dicts:
        # We just added the verbs to the list in _sentence_to_srl_instances
        # but we actually want to replace them with their frames, so we
        # reset them here.
        verbs_for_sentence: List[str] = results["verbs"]
        results["verbs"] = []
        # The verbs are in order, but nested as we have multiple sentences.
        # The outputs are already flattened from running through the model,
        # so we just index into this flat list for each verb, updating as we go.
        for verb in verbs_for_sentence:
            output = outputs[output_index]
            tags = output['tags']
            description = self.make_srl_string(results["words"], tags)
            results["verbs"].append({
                    "verb": verb,
                    "description": description,
                    "tags": tags,
            })
            output_index += 1
    return sanitize(return_dicts)
def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
    """
    Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
    and returns JSON that looks like

    .. code-block:: js

        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
    """
    # For SRL, we have more instances than sentences, but the user specified
    # a batch size with respect to the number of sentences passed, so we respect
    # that here by taking the batch size which we use to be the number of sentences
    # we are given.
    batch_size = len(inputs)
    # One list of instances per input sentence — presumably one instance per
    # detected verb; sentences with no verbs yield an empty list.
    instances_per_sentence = [
            self._sentence_to_srl_instances(json) for json in inputs
    ]

    flattened_instances = [
            instance
            for sentence_instances in instances_per_sentence
            for instance in sentence_instances
    ]

    if not flattened_instances:
        # No verbs in any sentence: nothing to predict, but "words" is still
        # populated by re-tokenizing each input sentence.
        return sanitize([{
                "verbs": [],
                "words": self._tokenizer.split_words(x["sentence"])
        } for x in inputs])

    # Make the instances into batches and check the last batch for
    # padded elements as the number of instances might not be perfectly
    # divisible by the batch size.
    batched_instances = group_by_count(flattened_instances, batch_size, None)
    batched_instances[-1] = [
            instance for instance in batched_instances[-1] if instance is not None
    ]

    # Run the model on the batches; ``outputs`` is flat, in the same order
    # as ``flattened_instances``.
    outputs = []
    for batch in batched_instances:
        outputs.extend(self._model.forward_on_instances(batch))

    verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
    return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

    # ``output_index`` walks the flat ``outputs`` list in lock-step with the
    # per-sentence verb counts, re-associating each frame with its sentence.
    output_index = 0
    for sentence_index, verb_count in enumerate(verbs_per_sentence):
        if verb_count == 0:
            # We didn't run any predictions for sentences with no verbs,
            # so we don't have a way to extract the original sentence.
            # Here we just tokenize the input again.
            original_text = self._tokenizer.split_words(
                    inputs[sentence_index]["sentence"])
            return_dicts[sentence_index]["words"] = original_text
            continue

        for _ in range(verb_count):
            output = outputs[output_index]
            words = output["words"]
            tags = output['tags']
            description = self.make_srl_string(words, tags)
            # Reassigned once per verb; every frame for a sentence shares the
            # same tokenization, so this is idempotent.
            return_dicts[sentence_index]["words"] = words
            return_dicts[sentence_index]["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })
            output_index += 1
    return sanitize(return_dicts)