def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    Expects JSON that looks like ``{"sentence": "..."}``
    and returns JSON that looks like

    .. code-block:: js

        {"words": [...],
         "verbs": [
            {"verb": "...", "description": "...", "tags": [...]},
            ...
            {"verb": "...", "description": "...", "tags": [...]},
        ]}
    """
    instances = self._sentence_to_srl_instances(inputs)

    if not instances:
        return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

    outputs = self._model.forward_on_instances(instances)

    results = {"verbs": [], "words": outputs[0]["words"]}
    for output in outputs:
        tags = output["tags"]
        description = self.make_srl_string(output["words"], tags)
        results["verbs"].append({
            "verb": output["verb"],
            "description": description,
            "tags": tags,
        })
    return sanitize(results)

def test_sanitize(self):
    assert util.sanitize(torch.Tensor([1, 2])) == [1, 2]
    assert util.sanitize(torch.LongTensor([1, 2])) == [1, 2]

    with pytest.raises(ValueError):
        util.sanitize(Unsanitizable())

    assert util.sanitize(Sanitizable()) == {"sanitizable": True}

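# The behaviour exercised by the tests above can be pictured as a recursive
# conversion into JSON-serialisable Python types. The sketch below is only an
# illustrative reimplementation, not the library's actual code; in particular,
# the to_json() hook assumed for "sanitizable" objects is an assumption made
# here for the example.
from typing import Any

import numpy
import torch


def sanitize_sketch(x: Any) -> Any:
    """Recursively convert `x` into JSON-serialisable Python types."""
    if x is None or isinstance(x, (str, float, int, bool)):
        return x
    if isinstance(x, torch.Tensor):
        return x.cpu().tolist()
    if isinstance(x, numpy.ndarray):
        return x.tolist()
    if isinstance(x, numpy.number):
        return x.item()
    if isinstance(x, dict):
        return {key: sanitize_sketch(value) for key, value in x.items()}
    if isinstance(x, (list, tuple)):
        return [sanitize_sketch(item) for item in x]
    if hasattr(x, "to_json"):
        # assumed hook for user-defined "sanitizable" objects
        return x.to_json()
    raise ValueError(f"Cannot sanitize {x} of type {type(x)}.")
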
def predict_tokenized(self, tokenized_sentence: List[str]) -> JsonDict:
    """
    Predicts the semantic roles of the supplied sentence tokens and returns
    a dictionary with the results.

    Parameters
    ----------
    tokenized_sentence : ``List[str]``
        The sentence tokens to parse via semantic role labeling.

    Returns
    -------
    A dictionary representation of the semantic roles in the sentence.
    """
    spacy_doc = self._tokenizer.spacy.tokenizer.tokens_from_list(tokenized_sentence)
    for pipe in filter(None, self._tokenizer.spacy.pipeline):
        pipe[1](spacy_doc)

    tokens = [token for token in spacy_doc]
    instances = self.tokens_to_instances(tokens)

    if not instances:
        return sanitize({"verbs": [], "words": tokens})

    return self.predict_instances(instances)

def predict_json(self, inputs: JsonDict) -> JsonDict:
    instance, return_dict = self._my_json_to_instance(inputs)
    world = instance.fields["world"].metadata  # type: ignore
    outputs = self._model.forward_on_instance(instance)

    answer_index = outputs["answer_index"]
    if answer_index == 0:
        answer = "A"
    elif answer_index == 1:
        answer = "B"
    else:
        answer = "None"
    outputs["answer"] = answer

    return_dict.update(outputs)

    if answer != "None":
        explanation = get_explanation(return_dict["logical_form"],
                                      return_dict["world_extractions"],
                                      answer_index,
                                      world)
    else:
        explanation = [{"header": "No consistent interpretation found!", "content": []}]

    return_dict["explanation"] = explanation
    return sanitize(return_dict)

def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    outputs = self._model.forward_on_instances(instances)
    for output in outputs:
        # format the NLTK tree as a string on a single line.
        tree = output.pop("trees")
        output["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        output["trees"] = tree.pformat(margin=1000000)
    return sanitize(outputs)

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)
    # format the NLTK tree as a string on a single line.
    tree = outputs.pop("trees")
    outputs["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
    outputs["trees"] = tree.pformat(margin=1000000)
    return sanitize(outputs)

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)

    words = outputs["words"]
    pos = outputs["pos"]
    heads = outputs["predicted_heads"]
    tags = outputs["predicted_dependencies"]
    outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
    return sanitize(outputs)

def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    outputs = self._model.forward_on_instances(instances)
    for output in outputs:
        words = output["words"]
        pos = output["pos"]
        heads = output["predicted_heads"]
        tags = output["predicted_dependencies"]
        output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
    return sanitize(outputs)

def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
    instance, return_dict = self._json_to_instance(inputs)
    outputs = self._model.forward_on_instance(instance, cuda_device)
    return_dict.update(outputs)

    # format the NLTK tree as a string on a single line.
    tree = return_dict.pop("trees")
    return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
    return_dict["trees"] = tree.pformat(margin=1000000)
    return sanitize(return_dict)

def predict_batch_json(self, inputs: List[JsonDict], cuda_device: int = -1) -> List[JsonDict]:
    instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
    outputs = self._model.forward_on_instances(instances, cuda_device)
    for output, return_dict in zip(outputs, return_dicts):
        return_dict.update(output)
        # format the NLTK tree as a string on a single line.
        tree = return_dict.pop("trees")
        return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        return_dict["trees"] = tree.pformat(margin=1000000)
    return sanitize(return_dicts)

def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    Create instance(s) after predicting the format. One sentence containing multiple verbs
    will lead to multiple instances.

    Expects JSON that looks like ``{"sentence": "..."}``

    Returns a JSON that looks like

    .. code-block:: js

        {"tokens": [...],
         "tag_spans": [{"ARG0": "...",
                        "V": "...",
                        "ARG1": "...",
                         ...}]}
    """
    sent_tokens = self._tokenizer.tokenize(inputs["sentence"])

    # Find all verbs in the input sentence
    pred_ids = [i for (i, t) in enumerate(sent_tokens) if t.pos_ == "VERB"]

    # Create instances
    instances = [self._json_to_instance({"sentence": sent_tokens,
                                         "predicate_index": pred_id})
                 for pred_id in pred_ids]

    # Run model
    outputs = [[sanitize_label(label) for label in
                self._model.forward_on_instance(instance)["tags"]]
               for instance in instances]

    # Consolidate predictions
    pred_dict = consolidate_predictions(outputs, sent_tokens)

    # Build and return output dictionary
    results = {"verbs": [], "words": sent_tokens}

    for tags in pred_dict.values():
        # Join multi-word predicates
        tags = join_mwp(tags)

        # Create description text
        description = make_oie_string(sent_tokens, tags)

        # Add a predicate prediction to the return dictionary.
        results["verbs"].append({
            "verb": get_predicate_text(sent_tokens, tags),
            "description": description,
            "tags": tags,
        })

    return sanitize(results)

def predict_instances(self, instances: List[Instance]) -> JsonDict:
    outputs = self._model.forward_on_instances(instances)

    results = {"verbs": [], "words": outputs[0]["words"]}
    for output in outputs:
        tags = output["tags"]
        description = self.make_srl_string(output["words"], tags)
        results["verbs"].append({
            "verb": output["verb"],
            "description": description,
            "tags": tags,
        })
    return sanitize(results)

def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    Expects JSON that looks like ``{"sentence": "..."}``
    and returns JSON that looks like

    .. code-block:: js

        {"words": [...],
         "verbs": [
            {"verb": "...", "description": "...", "tags": [...]},
            ...
            {"verb": "...", "description": "...", "tags": [...]},
        ]}
    """
    instances = self._sentence_to_srl_instances(inputs)

    if not instances:
        return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

    return self.predict_instances(instances)

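# Usage sketch for the single-sentence SRL entry point above. The archive path
# and the predictor name are placeholders/assumptions; any AllenNLP-style SRL
# archive loaded through Predictor.from_path would do.
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("/path/to/srl-model.tar.gz", "semantic-role-labeling")
result = predictor.predict_json({"sentence": "The keys were left on the table."})
for verb in result["verbs"]:
    # each entry carries the predicate, a human-readable description, and BIO tags
    print(verb["verb"], verb["description"])
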
def attack_from_json(
    self,
    inputs: JsonDict,
    field_to_change: str = 'tokens',
    field_to_attack: str = 'label',
    grad_input_field: str = 'grad_input_1',
) -> JsonDict:
    raw_instance = self.predictor.json_to_labeled_instances(inputs)[0]
    raw_text_field: TextField = raw_instance[field_to_change]
    raw_tokens = raw_text_field.tokens

    adv_instance = deepcopy(raw_instance)
    adv_text_field: TextField = adv_instance[field_to_change]
    adv_tokens = adv_text_field.tokens

    # equal to the raw one...
    _, outputs = self.predictor.get_gradients([adv_instance])

    # set up some states
    change_positions__ = set()
    forbidden_idxs__ = set()
    for forbidden_token in self.forbidden_tokens:
        if forbidden_token in self.vocab._token_to_index['tokens']:
            forbidden_idxs__.add(self.vocab._token_to_index['tokens'][forbidden_token])

    successful = False
    for step in range(self.max_step):
        grads, _ = self.predictor.get_gradients([adv_instance])
        grad = torch.from_numpy(grads[grad_input_field][0]).to(self.model_device)
        grad_norm = grad.norm(dim=-1)

        position_mask = [False for _ in range(len(adv_tokens))]
        is_max_changed = len(change_positions__) > self.max_change_num(len(raw_tokens))

        for idx, token in enumerate(adv_tokens):
            if token.text in self.ignore_tokens:
                position_mask[idx] = True
            if is_max_changed and idx not in change_positions__:
                position_mask[idx] = True
        if all(position_mask):
            print("All words are forbidden.")
            break
        for idx in range(len(position_mask)):
            if position_mask[idx]:
                grad_norm[idx] = -1

        # select a word and forbid itself
        token_vids: List[int] = []
        new_token_vids: List[int] = []

        _, topk_idxs = grad_norm.sort(descending=True)
        token_sids = select(ordered_idxs=cast_list(topk_idxs),
                            num_to_select=self.iter_change_num,
                            selected=change_positions__,
                            max_num=self.max_change_num(len(raw_tokens)))
        token_sids = [ele for ele in token_sids if position_mask[ele] is False]

        for token_sid in token_sids:
            token_grad = grad[token_sid]

            token_vid = adv_text_field._indexed_tokens["tokens"][token_sid]
            token_emb = self.token_embedding[token_vid]

            change_positions__.add(token_sid)
            forbidden_idxs__.add(token_vid)
            # print(change_positions__)

            delta = token_grad / torch.norm(token_grad) * self.step_size
            new_token_emb = token_emb + delta

            tk_vals, tk_idxs = self.embed_searcher.find_neighbours(
                new_token_emb, 'cos', topk=None, rho=None)
            for tk_idx in cast_list(tk_idxs):
                if tk_idx in forbidden_idxs__:
                    continue
                else:
                    new_token_vid = tk_idx
                    break

            token_vids.append(token_vid)
            new_token_vids.append(new_token_vid)

            # flip token
            new_token = Token(self.vocab._index_to_token["tokens"][new_token_vid])  # type: ignore
            adv_text_field.tokens[token_sid] = new_token

        adv_instance.indexed = False

        # Get model predictions on current_instance, and then label the instances
        grads, outputs = self.predictor.get_gradients([adv_instance])  # predictions
        for key, output in outputs.items():
            outputs[key] = cast_list(outputs[key])

        # add labels to current_instances
        current_instance_labeled = self.predictor.predictions_to_labeled_instances(
            adv_instance, outputs)[0]
        # if the prediction has changed, then stop
        if current_instance_labeled[field_to_attack] != raw_instance[field_to_attack]:
            successful = True
            break

    return sanitize({
        "adv": adv_tokens,
        "raw": raw_tokens,
        "outputs": outputs,
        "success": 1 if successful else 0
    })

async def predict(inputs: Dict[str, Any]):
    with http_error_handling():
        return sanitize(pipeline.predict(**inputs))

def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
    instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
    outputs = self._model.forward_on_instances(instances)
    for output, return_dict in zip(outputs, return_dicts):
        return_dict.update(output)
    return sanitize(return_dicts)

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)
    return sanitize(outputs)

def predict_batch_json(self, inputs: List[JsonDict], cuda_device: int = -1) -> List[JsonDict]:
    """
    Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
    and returns JSON that looks like

    .. code-block:: js

        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
    """
    # For SRL, we have more instances than sentences, but the user specified
    # a batch size with respect to the number of sentences passed, so we respect
    # that here by taking the batch size which we use to be the number of sentences
    # we are given.
    batch_size = len(inputs)
    instances_per_sentence, return_dicts = zip(*[self._sentence_to_srl_instances(json)
                                                 for json in inputs])

    flattened_instances = [instance for sentence_instances in instances_per_sentence
                           for instance in sentence_instances]
    if not flattened_instances:
        return sanitize(return_dicts)

    # Make the instances into batches and check the last batch for
    # padded elements as the number of instances might not be perfectly
    # divisible by the batch size.
    batched_instances = group_by_count(flattened_instances, batch_size, None)
    batched_instances[-1] = [instance for instance in batched_instances[-1]
                             if instance is not None]
    # Run the model on the batches.
    outputs = []
    for batch in batched_instances:
        outputs.extend(self._model.forward_on_instances(batch, cuda_device))

    sentence_index = 0
    for results in return_dicts:
        # We just added the verbs to the list in _sentence_to_srl_instances
        # but we actually want to replace them with their frames, so we
        # reset them here.
        verbs_for_sentence: List[str] = results["verbs"]
        results["verbs"] = []
        # The verbs are in order, but nested as we have multiple sentences.
        # The outputs are already flattened from running through the model,
        # so we just index into this flat list for each verb, updating as we go.
        for verb in verbs_for_sentence:
            output = outputs[sentence_index]
            tags = output['tags']
            description = self.make_srl_string(results["words"], tags)
            results["verbs"].append({
                "verb": verb,
                "description": description,
                "tags": tags,
            })
            sentence_index += 1

    return sanitize(return_dicts)

def predict_json(self, inputs: JsonDict) -> JsonDict:
    instance = self._json_to_instance(inputs)
    outputs = self._model.forward_on_instance(instance)
    return sanitize(outputs)

def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a data source and registers the results
    in an Elasticsearch index.

    Parameters
    ----------
    pipeline
    data_source
    config
    elasticsearch

    Returns
    -------
    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long,...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only makes really sense when we have a predict_batch_json method implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(npartitions=n_partitions).persist()

    ddf_mapped["annotation"] = ddf_mapped.map_partitions(annotate_batch, meta=(None, object))

    ddf_source = data_source.to_dataframe().repartition(npartitions=n_partitions).persist()
    # Keep as metadata only non used values/columns
    ddf_source = ddf_source[[c for c in ddf_source.columns if c not in ddf_mapped.columns]]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records"))))

    ddf = DaskElasticClient(host=elasticsearch.es_host,
                            retry_on_timeout=True,
                            http_compress=True).save(ddf_mapped,
                                                     index=elasticsearch.es_index,
                                                     doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record({
        **(config.metadata or {}),
        "datasource": data_source.source,
        # TODO: This should change when ui is normalized (action detail and action link naming)
        "explore_name": elasticsearch.es_index,
        "model": pipeline.name,
        "columns": ddf.columns.values.tolist(),
        "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
        "pipeline": pipeline.type_name,
        "output": pipeline.output,
        "inputs": pipeline.inputs,  # backward compatibility
        "signature": pipeline.inputs + [pipeline.output],
        "predict_signature": pipeline.inputs,
        "labels": pipeline.head.labels,
        "task": pipeline.head.task_name().as_string(),
    })
    return ddf.persist()

def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    Create instance(s) after predicting the format. One sentence containing multiple verbs
    will lead to multiple instances.

    Expects JSON that looks like `{"sentence": "..."}`

    Returns a JSON that looks like:

    ```
    {"tokens": [...],
     "tag_spans": [{"ARG0": "...",
                    "V": "...",
                    "ARG1": "...",
                     ...}]}
    ```
    """
    sent_tokens = self._tokenizer.tokenize(inputs["sentence"])

    # Find all verbs in the input sentence
    pred_ids = [
        i
        for (i, t) in enumerate(sent_tokens)
        if t.pos_ == "VERB" or (self._language.startswith("en_") and t.pos_ == "AUX")
    ]

    # Create instances
    instances = [
        self._json_to_instance({"sentence": sent_tokens, "predicate_index": pred_id})
        for pred_id in pred_ids
    ]

    # Run model
    outputs = [
        [sanitize_label(label) for label in self._model.forward_on_instance(instance)["tags"]]
        for instance in instances
    ]

    # Consolidate predictions
    pred_dict = consolidate_predictions(outputs, sent_tokens)

    # Build and return output dictionary
    results = {"verbs": [], "words": sent_tokens}

    for tags in pred_dict.values():
        # Join multi-word predicates
        tags = join_mwp(tags)

        # Create description text
        description = make_oie_string(sent_tokens, tags)

        # Add a predicate prediction to the return dictionary.
        results["verbs"].append(
            {
                "verb": get_predicate_text(sent_tokens, tags),
                "description": description,
                "tags": tags,
            }
        )

    return sanitize(results)

def attack_from_json(
    self,
    inputs: JsonDict,
    input_field_to_attack: str = "tokens",
    grad_input_field: str = "grad_input_1",
    ignore_tokens: List[str] = None,
    target: JsonDict = None,
) -> JsonDict:
    """
    Replaces one token at a time from the input until the model's prediction changes.
    `input_field_to_attack` is for example `tokens`, it says what the input field is called.
    `grad_input_field` is for example `grad_input_1`, which is a key into a grads dictionary.

    The method computes the gradient w.r.t. the tokens, finds the token with the maximum
    gradient (by L2 norm), and replaces it with another token based on the first-order Taylor
    approximation of the loss. This process is iteratively repeated until the prediction
    changes. Once a token is replaced, it is not flipped again.

    # Parameters

    inputs : `JsonDict`
        The model inputs, the same as what is passed to a `Predictor`.
    input_field_to_attack : `str`, optional (default=`'tokens'`)
        The field that has the tokens that we're going to be flipping. This must be a
        `TextField`.
    grad_input_field : `str`, optional (default=`'grad_input_1'`)
        If there is more than one field that gets embedded in your model (e.g., a question and
        a passage, or a premise and a hypothesis), this tells us the key to use to get the
        correct gradients. This selects from the output of :func:`Predictor.get_gradients`.
    ignore_tokens : `List[str]`, optional (default=`DEFAULT_IGNORE_TOKENS`)
        These tokens will not be flipped. The default list includes some simple punctuation,
        OOV and padding tokens, and common control tokens for BERT, etc.
    target : `JsonDict`, optional (default=`None`)
        If given, this will be a `targeted` hotflip attack, where instead of just trying to
        change a model's prediction from what it is currently predicting, we try to change it
        to a `specific` target value. This is a `JsonDict` because it needs to specify the
        field name and target value. For example, for a masked LM, this would be something
        like `{"words": ["she"]}`, because `"words"` is the field name, there is one mask
        token (hence the list of length one), and we want to change the prediction from
        whatever it was to `"she"`.
    """
    instance = self.predictor._json_to_instance(inputs)
    if target is None:
        output_dict = self.predictor._model.forward_on_instance(instance)
    else:
        output_dict = target

    # This now holds the predictions that we want to change (either away from or towards,
    # depending on whether `target` was passed). We'll use this in the loop below to check for
    # when we've met our stopping criterion.
    original_instances = self.predictor.predictions_to_labeled_instances(instance, output_dict)

    # This is just for ease of access in the UI, so we know the original tokens. It's not used
    # in the logic below.
    original_text_field: TextField = original_instances[0][  # type: ignore
        input_field_to_attack
    ]
    original_tokens = deepcopy(original_text_field.tokens)

    final_tokens = []
    final_outputs = []
    # `original_instances` is a list because there might be several different predictions that
    # we're trying to attack (e.g., all of the NER tags for an input sentence). We attack them
    # one at a time.
    for instance in original_instances:
        tokens, outputs = self.attack_instance(
            instance=instance,
            inputs=inputs,
            input_field_to_attack=input_field_to_attack,
            grad_input_field=grad_input_field,
            ignore_tokens=ignore_tokens,
            target=target,
        )
        final_tokens.append(tokens)
        final_outputs.append(outputs)

    return sanitize(
        {"final": final_tokens, "original": original_tokens, "outputs": final_outputs}
    )

def _add_output(mod, _, outputs):
    if idx in results:
        prev = torch.Tensor(results[idx])
        outputs = torch.cat((prev, outputs), dim=0)
    outputs = outputs.unsqueeze(1)
    results[idx] = {"name": str(mod), "output": sanitize(outputs)}

def predict_instance(self, instance: Instance) -> JsonDict:
    new_instance = deepcopy(instance)
    outputs = self._model.forward_on_instance(new_instance)
    outputs['paper_id'] = instance['paper_id'].label
    return sanitize(outputs)

def predict_batch_tokens(self, tokens_list):
    instances = [self.predictor._dataset_reader.text_to_instance(toks, tags)
                 for toks, tags in tokens_list]
    outputs = self.predictor._model.forward_on_instances(instances)
    return sanitize(outputs)

def predict_tokens(self, tokens):
    instance = self.predictor._dataset_reader.text_to_instance(tokens[0], tokens[1])
    output = self.predictor._model.forward_on_instance(instance)
    return sanitize(output)

def predict_raw(self, sentence):
    instance = self.predictor._json_to_instance({'sentence': sentence})
    output = self.predictor._model.forward_on_instance(instance)
    return sanitize(output)

def predict_instance(self, instance: Instance) -> JsonDict:
    if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
        # Handle cases where the labels are present in the test set but not in the training set
        self._predict_unknown(instance)
    outputs = self._model.forward_on_instance(instance)
    return sanitize(outputs)

# fill batch with instances
batch = []
while len(batch) < BATCH_SIZE:
    try:
        instance = next(it)
    except StopIteration:
        outer_loop = False
        break
    batch.append(instance)

# get SRL predictions for batch of instances
res = predictor._model.forward_on_instances(batch)

# make a line for each instance
for d in sanitize(res):
    tags = d['tags']
    words = d['words']

    # sometimes there is no B-V
    if 'B-V' not in tags:
        num_no_verb += 1
        continue

    # sometimes there is only a verb but no arguments (e.g. auxiliary word) - skip
    if not [tag for tag in tags if 'ARG' in tag]:
        num_only_verb += 1
        continue

    # make line
    verb_index = tags.index('B-V')

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)
    outputs['texts'] = [str(token) for token in instance.fields['tokens'].tokens]
    outputs['data_id'] = [str(token) for token in instance.fields['data_id'].tokens]
    return sanitize(outputs)

def _add_output(mod, _, outputs):
    results[idx] = {"name": str(mod), "output": sanitize(outputs)}

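# Both _add_output variants above follow PyTorch's forward-hook signature
# (module, inputs, outputs). A minimal sketch of how such hooks are typically
# registered; the toy model and the results dict are assumptions for the example.
import torch

results = {}
model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU())

handles = []
for idx, module in enumerate(model.modules()):
    def _add_output(mod, _, outputs, idx=idx):
        # record each module's output; sanitize() would make it JSON-serialisable
        results[idx] = {"name": str(mod), "output": outputs.detach().tolist()}
    handles.append(module.register_forward_hook(_add_output))

model(torch.randn(2, 4))

for handle in handles:
    handle.remove()
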
def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
    """
    Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
    and returns JSON that looks like

    .. code-block:: js

        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
    """
    # For SRL, we have more instances than sentences, but the user specified
    # a batch size with respect to the number of sentences passed, so we respect
    # that here by taking the batch size which we use to be the number of sentences
    # we are given.
    batch_size = len(inputs)
    instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

    flattened_instances = [instance for sentence_instances in instances_per_sentence
                           for instance in sentence_instances]

    if not flattened_instances:
        return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                         for x in inputs])

    # Make the instances into batches and check the last batch for
    # padded elements as the number of instances might not be perfectly
    # divisible by the batch size.
    batched_instances = group_by_count(flattened_instances, batch_size, None)
    batched_instances[-1] = [instance for instance in batched_instances[-1]
                             if instance is not None]

    # Run the model on the batches.
    outputs = []
    for batch in batched_instances:
        outputs.extend(self._model.forward_on_instances(batch))

    verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
    return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

    output_index = 0
    for sentence_index, verb_count in enumerate(verbs_per_sentence):
        if verb_count == 0:
            # We didn't run any predictions for sentences with no verbs,
            # so we don't have a way to extract the original sentence.
            # Here we just tokenize the input again.
            original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
            return_dicts[sentence_index]["words"] = original_text
            continue

        for _ in range(verb_count):
            output = outputs[output_index]
            words = output["words"]
            tags = output['tags']
            description = self.make_srl_string(words, tags)
            return_dicts[sentence_index]["words"] = words
            return_dicts[sentence_index]["verbs"].append({
                "verb": output["verb"],
                "description": description,
                "tags": tags,
            })
            output_index += 1

    return sanitize(return_dicts)

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)
    outputs['answer'] = self._execute_logical_form_on_table(outputs['logical_form'],
                                                            outputs['original_table'])
    return sanitize(outputs)

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)
    ret_dict = eds_trans_outputs_into_mrp(self.pv_predictor, self.vocab_dict, outputs)
    return sanitize(ret_dict)

async def explain(inputs: Dict[str, Any]):
    with http_error_handling():
        return sanitize(pipeline.explain(**inputs))

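# The two async handlers above (predict and explain) read like request handlers
# for an ASGI web service. A hedged sketch of how they might be wired up, assuming
# FastAPI and an already-loaded `pipeline` object with predict()/explain() methods --
# both assumptions made here, not the project's actual wiring.
from typing import Any, Dict

from fastapi import FastAPI

app = FastAPI()


@app.post("/predict")
async def predict_endpoint(inputs: Dict[str, Any]):
    # sanitize() keeps the response JSON-serialisable (tensors -> lists, etc.)
    return sanitize(pipeline.predict(**inputs))


@app.post("/explain")
async def explain_endpoint(inputs: Dict[str, Any]):
    return sanitize(pipeline.explain(**inputs))
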
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    outputs = self._model.forward_on_instances(instances)
    return sanitize(outputs)

def test_sanitize(self):
    assert util.sanitize(torch.Tensor([1, 2])) == [1, 2]
    assert util.sanitize(torch.LongTensor([1, 2])) == [1, 2]

def predict_instance(self, instance: Instance) -> JsonDict:
    outputs = self._model.forward_on_instance(instance)
    del outputs["logits"]
    del outputs["class_probabilities"]
    return sanitize(outputs)

def process(token_ids):
    temp = " ".join(token_ids)
    temp = temp.replace(" ##", "")
    temp = temp.replace("[unused1]", "( ")
    temp = temp.replace("[unused2]", " ; ")
    temp = temp.replace("[unused3]", "")
    temp = temp.replace("[unused4]", " ; ")
    temp = temp.replace("[unused5]", "")
    temp = temp.replace("[unused6]", " )")
    temp = temp.strip()
    temp = temp.split("[SEP]")
    ans = []
    for x in temp:
        if x != "":
            ans.append(x)
    return ans


archive = load_archive("models/imojie",
                       weights_file="models/imojie/model_state_epoch_7.th",
                       cuda_device=-1)
predictor = Predictor.from_archive(archive, "noie_seq2seq")

inp_sent = 'I ate an apple and an orange'
inp_instance = predictor._dataset_reader.text_to_instance(inp_sent)

output = predictor._model.forward_on_instance(inp_instance)
output = sanitize(output)
output = process(output["predicted_tokens"][0])
print(output)

def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: Union[int, torch.device] = -1,
    batch_weight_key: str = None,
    output_file: str = None,
    predictions_output_file: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already
        contain their data).
    cuda_device : `Union[int, torch.device]`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using
        this device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to
        weight the loss for that batch.  If this is not given, we use a weight of 1 for every
        batch.
    output_file : `str`, optional (default=`None`)
        Optional path to write the final metrics to.
    predictions_output_file : `str`, optional (default=`None`)
        Optional path to write the predictions to.

    # Returns

    `Dict[str, Any]`
        The final metrics.
    """
    check_for_gpu(cuda_device)
    data_loader.set_target_device(int_to_device(cuda_device))
    predictions_file = (None if predictions_output_file is None
                        else open(predictions_output_file, "w"))

    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics):
                logger.warning('Metrics with names beginning with "_" will '
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (", ".join(["%s: %.2f" % (name, value)
                                      for name, value in metrics.items()
                                      if not name.startswith("_")]) + " ||")
            generator_tqdm.set_description(description, refresh=False)

            if predictions_file is not None:
                predictions = json.dumps(sanitize(model.make_output_human_readable(output_dict)))
                predictions_file.write(predictions + "\n")

        if predictions_file is not None:
            predictions_file.close()

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        if output_file is not None:
            dump_metrics(output_file, final_metrics, log=True)

        return final_metrics

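# Usage sketch for evaluate(); the model, data loader, and paths below are
# placeholders/assumptions rather than values taken from this code.
metrics = evaluate(
    model=model,                            # a trained allennlp Model
    data_loader=validation_data_loader,     # DataLoader built over the evaluation data
    cuda_device=-1,                         # CPU; pass a GPU id to evaluate on GPU
    output_file="metrics.json",             # final metrics are dumped here
    predictions_output_file="preds.jsonl",  # one sanitized prediction dict per line
)
print(metrics["loss"])
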
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    for instance in instances:
        self._dataset_reader.apply_token_indexers(instance)
    outputs = self._model.forward_on_instances(instances)
    return sanitize(outputs)

def predict_batch(self,
                  sents: List[List[str]],
                  batch_size: int = 256,
                  warm_up: int = 0,
                  beam_search: int = 1) -> JsonDict:
    sents_token = [self._tag_tokens(sent) for sent in sents]

    instances, insts_st, insts_ed = [], [], []
    # find all verbs in the input sentence
    for sent_token in sents_token:
        pred_ids = [i for (i, t) in enumerate(sent_token) if t.pos_ == 'VERB']
        insts_st.append(len(instances))
        instances.extend([self._json_to_instance({'sentence': sent_token,
                                                  'predicate_index': pid})
                          for pid in pred_ids])
        insts_ed.append(len(instances))

    # warm up the model using warm_up batch (mainly because of non-determinism of ELMo)
    if warm_up:
        for b in range(0, min(warm_up * batch_size, len(instances)), batch_size):
            batch_inst = instances[b:b + batch_size]
            self._model.forward_on_instances(batch_inst)

    # run model
    outputs, probs = [], []
    for b in range(0, len(instances), batch_size):
        batch_inst = instances[b:b + batch_size]
        for prediction in self._model.forward_on_instances(batch_inst):
            all_tags, all_probs = self._beam_search(prediction['class_probabilities'],
                                                    prediction['mask'],
                                                    n_best=beam_search)
            outputs.append(all_tags)
            probs.append(all_probs)

    results_li = []
    for sent_token, st, ed in zip(sents_token, insts_st, insts_ed):
        # consolidate predictions
        cur_o = [e for o in outputs[st:ed] for e in o]
        cur_p = [e for o in probs[st:ed] for e in o]

        # Build and return output dictionary
        results = {'verbs': [], 'words': [token.text for token in sent_token]}
        for tags, prob in zip(cur_o, cur_p):
            # create description text
            description = make_oie_string(sent_token, tags)
            # add a predicate prediction to the return dictionary
            results['verbs'].append({
                'verb': get_predicate_text(sent_token, tags),
                'description': description,
                'tags': tags,
                'probs': prob,
            })
        results_li.append(results)

    return sanitize(results_li)

def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    outputs = self._model.forward_on_instances(instances)
    for output in outputs:
        output['answer'] = self._execute_logical_form_on_table(output['logical_form'],
                                                               output['original_table'])
    return sanitize(outputs)

def attack_from_json(self,
                     inputs: JsonDict,
                     input_field_to_attack: str = 'tokens',
                     grad_input_field: str = 'grad_input_1',
                     ignore_tokens: List[str] = None,
                     target: JsonDict = None) -> JsonDict:
    """
    Replaces one token at a time from the input until the model's prediction changes.
    ``input_field_to_attack`` is for example ``tokens``, it says what the input field is called.
    ``grad_input_field`` is for example ``grad_input_1``, which is a key into a grads dictionary.

    The method computes the gradient w.r.t. the tokens, finds the token with the maximum
    gradient (by L2 norm), and replaces it with another token based on the first-order Taylor
    approximation of the loss.  This process is iteratively repeated until the prediction
    changes.  Once a token is replaced, it is not flipped again.

    Parameters
    ----------
    inputs : ``JsonDict``
        The model inputs, the same as what is passed to a ``Predictor``.
    input_field_to_attack : ``str``, optional (default='tokens')
        The field that has the tokens that we're going to be flipping.  This must be a
        ``TextField``.
    grad_input_field : ``str``, optional (default='grad_input_1')
        If there is more than one field that gets embedded in your model (e.g., a question and
        a passage, or a premise and a hypothesis), this tells us the key to use to get the
        correct gradients.  This selects from the output of :func:`Predictor.get_gradients`.
    ignore_tokens : ``List[str]``, optional (default=DEFAULT_IGNORE_TOKENS)
        These tokens will not be flipped.  The default list includes some simple punctuation,
        OOV and padding tokens, and common control tokens for BERT, etc.
    target : ``JsonDict``, optional (default=None)
        If given, this will be a `targeted` hotflip attack, where instead of just trying to
        change a model's prediction from what it is currently predicting, we try to change it
        to a `specific` target value.  This is a ``JsonDict`` because it needs to specify the
        field name and target value.  For example, for a masked LM, this would be something
        like ``{"words": ["she"]}``, because ``"words"`` is the field name, there is one mask
        token (hence the list of length one), and we want to change the prediction from
        whatever it was to ``"she"``.
    """
    if self.token_embedding is None:
        self.initialize()
    ignore_tokens = DEFAULT_IGNORE_TOKENS if ignore_tokens is None else ignore_tokens

    # If `target` is `None`, we move away from the current prediction, otherwise we move
    # _towards_ the target.
    sign = -1 if target is None else 1
    instance = self.predictor._json_to_instance(inputs)
    if target is None:
        output_dict = self.predictor._model.forward_on_instance(instance)
    else:
        output_dict = target

    # This now holds the predictions that we want to change (either away from or towards,
    # depending on whether `target` was passed).  We'll use this in the loop below to check for
    # when we've met our stopping criterion.
    original_instances = self.predictor.predictions_to_labeled_instances(instance, output_dict)

    # This is just for ease of access in the UI, so we know the original tokens.  It's not used
    # in the logic below.
    original_text_field: TextField = original_instances[0][input_field_to_attack]  # type: ignore
    original_tokens = deepcopy(original_text_field.tokens)

    final_tokens = []
    # `original_instances` is a list because there might be several different predictions that
    # we're trying to attack (e.g., all of the NER tags for an input sentence).  We attack them
    # one at a time.
    for instance in original_instances:
        # Gets a list of the fields that we want to check to see if they change.
        fields_to_compare = utils.get_fields_to_compare(inputs, instance, input_field_to_attack)

        # We'll be modifying the tokens in this text field below, and grabbing the modified
        # list after the `while` loop.
        text_field: TextField = instance[input_field_to_attack]  # type: ignore

        # Because we can save computation by getting grads and outputs at the same time, we do
        # them together at the end of the loop, even though we use grads at the beginning and
        # outputs at the end.  This is our initial gradient for the beginning of the loop.  The
        # output can be ignored here.
        grads, outputs = self.predictor.get_gradients([instance])

        # Ignore any token that is in the ignore_tokens list by setting the token to already
        # flipped.
        flipped: List[int] = []
        for index, token in enumerate(text_field.tokens):
            if token.text in ignore_tokens:
                flipped.append(index)

        if 'clusters' in outputs:
            # Coref unfortunately needs a special case here.  We don't want to flip words in
            # the same predicted coref cluster, but we can't really specify a list of tokens,
            # because, e.g., "he" could show up in several different clusters.
            # TODO(mattg): perhaps there's a way to get `predictions_to_labeled_instances` to
            # return the set of tokens that shouldn't be changed for each instance?  E.g., you
            # could imagine setting a field on the `Token` object, that we could then read
            # here...
            for cluster in outputs['clusters']:
                for mention in cluster:
                    for index in range(mention[0], mention[1] + 1):
                        flipped.append(index)

        while True:
            # Compute L2 norm of all grads.
            grad = grads[grad_input_field]
            grads_magnitude = [g.dot(g) for g in grad]

            # only flip a token once
            for index in flipped:
                grads_magnitude[index] = -1

            # We flip the token with highest gradient norm.
            index_of_token_to_flip = numpy.argmax(grads_magnitude)
            if grads_magnitude[index_of_token_to_flip] == -1:
                # If we've already flipped all of the tokens, we give up.
                break
            flipped.append(index_of_token_to_flip)

            # TODO(mattg): This is quite a bit of a hack, both for gpt2 and for getting the
            # vocab id in general...  I don't have better ideas at the moment, though.
            indexer_name = 'tokens' if self.namespace == 'gpt2' else self.namespace
            input_tokens = text_field._indexed_tokens[indexer_name]
            original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]

            # Get new token using taylor approximation.
            new_id = self._first_order_taylor(grad[index_of_token_to_flip],
                                              self.token_embedding.weight,  # type: ignore
                                              original_id_of_token_to_flip,
                                              sign)

            # Flip token.  We need to tell the instance to re-index itself, so the text field
            # will actually update.
            new_token = Token(self.vocab._index_to_token[self.namespace][new_id])  # type: ignore
            text_field.tokens[index_of_token_to_flip] = new_token
            instance.indexed = False

            # Get model predictions on instance, and then label the instances
            grads, outputs = self.predictor.get_gradients([instance])  # predictions
            for key, output in outputs.items():
                if isinstance(output, torch.Tensor):
                    outputs[key] = output.detach().cpu().numpy().squeeze()
                elif isinstance(output, list):
                    outputs[key] = output[0]

            # TODO(mattg): taking the first result here seems brittle, if we're in a case where
            # there are multiple predictions.
            labeled_instance = self.predictor.predictions_to_labeled_instances(instance, outputs)[0]

            # If we've met our stopping criterion, we stop.
            has_changed = utils.instance_has_changed(labeled_instance, fields_to_compare)
            if target is None and has_changed:
                # With no target, we just want to change the prediction.
                break
            if target is not None and not has_changed:
                # With a given target, we want to *match* the target, which we check by
                # `not has_changed`.
                break

        final_tokens.append(text_field.tokens)

    return sanitize({"final": final_tokens,
                     "original": original_tokens,
                     "outputs": outputs})

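# Usage sketch for a HotFlip-style attacker like the one above, assuming the
# AllenNLP interpret API; the archive path, predictor name, and input sentence
# are placeholders.
from allennlp.interpret.attackers import Hotflip
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("/path/to/model.tar.gz", "text_classifier")
attacker = Hotflip(predictor)
attacker.initialize()  # builds the token-embedding matrix used for flips

attack = attacker.attack_from_json({"sentence": "a delightfully clever film"},
                                   input_field_to_attack="tokens",
                                   grad_input_field="grad_input_1")
print(attack["original"])  # original tokens
print(attack["final"])     # token sequence(s) after flipping until the prediction changed
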