Example #1
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances = self._sentence_to_srl_instances(inputs)

        if not instances:
            return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

        outputs = self._model.forward_on_instances(instances)

        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output['tags']
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })

        return sanitize(results)
Example #2
    def test_sanitize(self):
        assert util.sanitize(torch.Tensor([1, 2])) == [1, 2]
        assert util.sanitize(torch.LongTensor([1, 2])) == [1, 2]

        with pytest.raises(ValueError):
            util.sanitize(Unsanitizable())

        assert util.sanitize(Sanitizable()) == {"sanitizable": True}
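The `Unsanitizable` and `Sanitizable` names above are tiny helper classes defined alongside the test; `sanitize` falls back to an object's `to_json()` method when it does not recognize the type and raises `ValueError` otherwise. A minimal sketch of what those helpers likely look like:

class Unsanitizable:
    pass


class Sanitizable:
    def to_json(self):
        return {"sanitizable": True}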
Example #3
    def predict_tokenized(self, tokenized_sentence: List[str]) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence tokens and returns a dictionary
        with the results.

        Parameters
        ----------
        tokenized_sentence, ``List[str]``
            The sentence tokens to parse via semantic role labeling.

        Returns
        -------
        A dictionary representation of the semantic roles in the sentence.
        """
        spacy_doc = self._tokenizer.spacy.tokenizer.tokens_from_list(tokenized_sentence)
        for pipe in filter(None, self._tokenizer.spacy.pipeline):
            pipe[1](spacy_doc)

        tokens = [token for token in spacy_doc]
        instances = self.tokens_to_instances(tokens)

        if not instances:
            return sanitize({"verbs": [], "words": tokens})

        return self.predict_instances(instances)
Example #4
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance, return_dict = self._my_json_to_instance(inputs)
        world = instance.fields['world'].metadata  # type: ignore
        outputs = self._model.forward_on_instance(instance)

        answer_index = outputs['answer_index']
        if answer_index == 0:
            answer = "A"
        elif answer_index == 1:
            answer = "B"
        else:
            answer = "None"
        outputs['answer'] = answer

        return_dict.update(outputs)

        if answer != "None":
            explanation = get_explanation(return_dict['logical_form'],
                                          return_dict['world_extractions'],
                                          answer_index,
                                          world)
        else:
            explanation = [{"header": "No consistent interpretation found!", "content": []}]

        return_dict['explanation'] = explanation
        return sanitize(return_dict)
Example #5
 def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
     outputs = self._model.forward_on_instances(instances)
     for output in outputs:
         # format the NLTK tree as a string on a single line.
         tree = output.pop("trees")
         output["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
         output["trees"] = tree.pformat(margin=1000000)
     return sanitize(outputs)
Example #6
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        # format the NLTK tree as a string on a single line.
        tree = outputs.pop("trees")
        outputs["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        outputs["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)
Example #7
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)
Example #8
 def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
     outputs = self._model.forward_on_instances(instances)
     for output in outputs:
         words = output["words"]
         pos = output["pos"]
         heads = output["predicted_heads"]
         tags = output["predicted_dependencies"]
         output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
     return sanitize(outputs)
Example #9
    def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
        instance, return_dict = self._json_to_instance(inputs)
        outputs = self._model.forward_on_instance(instance, cuda_device)
        return_dict.update(outputs)

        # format the NLTK tree as a string on a single line.
        tree = return_dict.pop("trees")
        return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        return_dict["trees"] = tree.pformat(margin=1000000)
        return sanitize(return_dict)
Example #10
 def predict_batch_json(self, inputs: List[JsonDict], cuda_device: int = -1) -> List[JsonDict]:
     instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
     outputs = self._model.forward_on_instances(instances, cuda_device)
     for output, return_dict in zip(outputs, return_dicts):
         return_dict.update(output)
         # format the NLTK tree as a string on a single line.
         tree = return_dict.pop("trees")
         return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
         return_dict["trees"] = tree.pformat(margin=1000000)
     return sanitize(return_dicts)
Example #11
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Create instance(s) after predicting the format. One sentence containing multiple verbs
        will lead to multiple instances.

        Expects JSON that looks like ``{"sentence": "..."}``

        Returns a JSON that looks like

        .. code-block:: js

            {"tokens": [...],
             "tag_spans": [{"ARG0": "...",
                            "V": "...",
                            "ARG1": "...",
                             ...}]}
        """
        sent_tokens = self._tokenizer.tokenize(inputs["sentence"])

        # Find all verbs in the input sentence
        pred_ids = [i for (i, t)
                    in enumerate(sent_tokens)
                    if t.pos_ == "VERB"]

        # Create instances
        instances = [self._json_to_instance({"sentence": sent_tokens,
                                             "predicate_index": pred_id})
                     for pred_id in pred_ids]

        # Run model
        outputs = [[sanitize_label(label) for label in self._model.forward_on_instance(instance)["tags"]]
                   for instance in instances]

        # Consolidate predictions
        pred_dict = consolidate_predictions(outputs, sent_tokens)

        # Build and return output dictionary
        results = {"verbs": [], "words": sent_tokens}

        for tags in pred_dict.values():
            # Join multi-word predicates
            tags = join_mwp(tags)

            # Create description text
            description = make_oie_string(sent_tokens, tags)

            # Add a predicate prediction to the return dictionary.
            results["verbs"].append({
                    "verb": get_predicate_text(sent_tokens, tags),
                    "description": description,
                    "tags": tags,
            })

        return sanitize(results)
Example #12
    def predict_instances(self, instances: List[Instance]) -> JsonDict:
        outputs = self._model.forward_on_instances(instances)

        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output['tags']
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })

        return sanitize(results)
Example #13
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances = self._sentence_to_srl_instances(inputs)

        if not instances:
            return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

        return self.predict_instances(instances)
Example #14
    def attack_from_json(
        self,
        inputs: JsonDict,
        field_to_change: str = 'tokens',
        field_to_attack: str = 'label',
        grad_input_field: str = 'grad_input_1',
    ) -> JsonDict:
        raw_instance = self.predictor.json_to_labeled_instances(inputs)[0]
        raw_text_field: TextField = raw_instance[field_to_change]
        raw_tokens = raw_text_field.tokens

        adv_instance = deepcopy(raw_instance)
        adv_text_field: TextField = adv_instance[field_to_change]
        adv_tokens = adv_text_field.tokens

        # equal to the raw one...
        _, outputs = self.predictor.get_gradients([adv_instance])

        # set up some states
        change_positions__ = set()
        forbidden_idxs__ = set()
        for forbidden_token in self.forbidden_tokens:
            if forbidden_token in self.vocab._token_to_index['tokens']:
                forbidden_idxs__.add(
                    self.vocab._token_to_index['tokens'][forbidden_token])

        successful = False
        for step in range(self.max_step):
            grads, _ = self.predictor.get_gradients([adv_instance])
            grad = torch.from_numpy(grads[grad_input_field][0]).to(
                self.model_device)
            grad_norm = grad.norm(dim=-1)

            position_mask = [False for _ in range(len(adv_tokens))]
            is_max_changed = len(change_positions__) > self.max_change_num(
                len(raw_tokens))

            for idx, token in enumerate(adv_tokens):
                if token.text in self.ignore_tokens:
                    position_mask[idx] = True
                if is_max_changed and idx not in change_positions__:
                    position_mask[idx] = True
            if all(position_mask):
                print("All words are forbidden.")
                break
            for idx in range(len(position_mask)):
                if position_mask[idx]:
                    grad_norm[idx] = -1

            # select a word and forbid itself
            token_vids: List[int] = []
            new_token_vids: List[int] = []

            _, topk_idxs = grad_norm.sort(descending=True)
            token_sids = select(ordered_idxs=cast_list(topk_idxs),
                                num_to_select=self.iter_change_num,
                                selected=change_positions__,
                                max_num=self.max_change_num(len(raw_tokens)))
            token_sids = [
                ele for ele in token_sids if not position_mask[ele]
            ]

            for token_sid in token_sids:
                token_grad = grad[token_sid]

                token_vid = adv_text_field._indexed_tokens["tokens"][token_sid]

                token_emb = self.token_embedding[token_vid]

                change_positions__.add(token_sid)
                forbidden_idxs__.add(token_vid)

                #                 print(change_positions__)

                delta = token_grad / torch.norm(token_grad) * self.step_size
                new_token_emb = token_emb + delta

                tk_vals, tk_idxs = self.embed_searcher.find_neighbours(
                    new_token_emb, 'cos', topk=None, rho=None)
                for tk_idx in cast_list(tk_idxs):
                    if tk_idx in forbidden_idxs__:
                        continue
                    else:
                        new_token_vid = tk_idx
                        break

                token_vids.append(token_vid)
                new_token_vids.append(new_token_vid)

                # flip token
                new_token = Token(self.vocab._index_to_token["tokens"]
                                  [new_token_vid])  # type: ignore
                adv_text_field.tokens[token_sid] = new_token

            adv_instance.indexed = False

            # Get model predictions on current_instance, and then label the instances
            grads, outputs = self.predictor.get_gradients([adv_instance
                                                           ])  # predictions
            for key, output in outputs.items():
                outputs[key] = cast_list(outputs[key])

            # add labels to current_instances
            current_instance_labeled = self.predictor.predictions_to_labeled_instances(
                adv_instance, outputs)[0]
            # if the prediction has changed, then stop
            if current_instance_labeled[field_to_attack] != raw_instance[
                    field_to_attack]:
                successful = True
                break

        return sanitize({
            "adv": adv_tokens,
            "raw": raw_tokens,
            "outputs": outputs,
            "success": 1 if successful else 0
        })
Example #15
 async def predict(inputs: Dict[str, Any]):
     with http_error_handling():
         return sanitize(pipeline.predict(**inputs))
Example #16
 def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
     instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
     outputs = self._model.forward_on_instances(instances)
     for output, return_dict in zip(outputs, return_dicts):
         return_dict.update(output)
     return sanitize(return_dicts)
Example #17
 def predict_instance(self, instance: Instance) -> JsonDict:
     outputs = self._model.forward_on_instance(instance)
     return sanitize(outputs)
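For context, a hedged sketch of how such a predictor is usually driven end to end; the archive path and predictor name below are placeholders, not part of the example above:

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

# Load a trained model archive and build the matching predictor
# (both arguments below are hypothetical placeholders).
archive = load_archive("/path/to/model.tar.gz")
predictor = Predictor.from_archive(archive, "sentence_tagger")

# predict_json builds an Instance and delegates to predict_instance, whose
# output is JSON-serializable thanks to the sanitize() call shown above.
result = predictor.predict_json({"sentence": "AllenNLP predictors return plain JSON."})
print(result)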
Example #18
    def predict_batch_json(self,
                           inputs: List[JsonDict],
                           cuda_device: int = -1) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by using the number of sentences we are given as the batch size.
        batch_size = len(inputs)
        instances_per_sentence, return_dicts = zip(
            *[self._sentence_to_srl_instances(json) for json in inputs])

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize(return_dicts)

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size,
                                           None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(
                batch, cuda_device))

        sentence_index = 0
        for results in return_dicts:
            # We just added the verbs to the list in _sentence_to_srl_instances
            # but we actually want to replace them with their frames, so we
            # reset them here.
            verbs_for_sentence: List[str] = results["verbs"]
            results["verbs"] = []
            # The verbs are in order, but nested as we have multiple sentences.
            # The outputs are already flattened from running through the model,
            # so we just index into this flat list for each verb, updating as we go.
            for verb in verbs_for_sentence:
                output = outputs[sentence_index]
                tags = output['tags']
                description = self.make_srl_string(results["words"], tags)
                results["verbs"].append({
                    "verb": verb,
                    "description": description,
                    "tags": tags,
                })
                sentence_index += 1

        return sanitize(return_dicts)
Example #19
 def predict_json(self, inputs: JsonDict) -> JsonDict:
     instance = self._json_to_instance(inputs)
     outputs = self._model.forward_on_instance(instance)
     return sanitize(outputs)
Example #20
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a data source and registers the results in an Elasticsearch index

    Parameters
    ----------
    pipeline
    data_source
    config
    elasticsearch

    Returns
    -------

    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long,...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only really makes sense when we have a predict_batch_json method implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(
        npartitions=n_partitions).persist()
    ddf_mapped["annotation"] = ddf_mapped.map_partitions(annotate_batch,
                                                         meta=(None, object))

    ddf_source = (data_source.to_dataframe().repartition(
        npartitions=n_partitions).persist())
    # Keep only the unused values/columns as metadata
    ddf_source = ddf_source[[
        c for c in ddf_source.columns if c not in ddf_mapped.columns
    ]]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records"))))

    ddf = DaskElasticClient(host=elasticsearch.es_host,
                            retry_on_timeout=True,
                            http_compress=True).save(
                                ddf_mapped,
                                index=elasticsearch.es_index,
                                doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record({
        **(config.metadata or {}),
        "datasource": data_source.source,
        # TODO: This should change when ui is normalized (action detail and action link naming)
        "explore_name": elasticsearch.es_index,
        "model": pipeline.name,
        "columns": ddf.columns.values.tolist(),
        "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
        "pipeline": pipeline.type_name,
        "output": pipeline.output,
        "inputs": pipeline.inputs,  # backward compatibility
        "signature": pipeline.inputs + [pipeline.output],
        "predict_signature": pipeline.inputs,
        "labels": pipeline.head.labels,
        "task": pipeline.head.task_name().as_string(),
    })
    return ddf.persist()
Example #21
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Create instance(s) after predicting the format. One sentence containing multiple verbs
        will lead to multiple instances.

        Expects JSON that looks like `{"sentence": "..."}`

        Returns a JSON that looks like:

        ```
        {"tokens": [...],
         "tag_spans": [{"ARG0": "...",
                        "V": "...",
                        "ARG1": "...",
                         ...}]}
        ```
        """
        sent_tokens = self._tokenizer.tokenize(inputs["sentence"])

        # Find all verbs in the input sentence
        pred_ids = [
            i for (i, t) in enumerate(sent_tokens) if t.pos_ == "VERB" or (
                self._language.startswith("en_") and t.pos_ == "AUX")
        ]

        # Create instances
        instances = [
            self._json_to_instance({
                "sentence": sent_tokens,
                "predicate_index": pred_id
            }) for pred_id in pred_ids
        ]

        # Run model
        outputs = [[
            sanitize_label(label)
            for label in self._model.forward_on_instance(instance)["tags"]
        ] for instance in instances]

        # Consolidate predictions
        pred_dict = consolidate_predictions(outputs, sent_tokens)

        # Build and return output dictionary
        results = {"verbs": [], "words": sent_tokens}

        for tags in pred_dict.values():
            # Join multi-word predicates
            tags = join_mwp(tags)

            # Create description text
            description = make_oie_string(sent_tokens, tags)

            # Add a predicate prediction to the return dictionary.
            results["verbs"].append({
                "verb":
                get_predicate_text(sent_tokens, tags),
                "description":
                description,
                "tags":
                tags,
            })

        return sanitize(results)
Example #22
    def attack_from_json(
        self,
        inputs: JsonDict,
        input_field_to_attack: str = "tokens",
        grad_input_field: str = "grad_input_1",
        ignore_tokens: List[str] = None,
        target: JsonDict = None,
    ) -> JsonDict:
        """
        Replaces one token at a time from the input until the model's prediction changes.
        `input_field_to_attack` is for example `tokens`, it says what the input field is
        called.  `grad_input_field` is for example `grad_input_1`, which is a key into a grads
        dictionary.

        The method computes the gradient w.r.t. the tokens, finds the token with the maximum
        gradient (by L2 norm), and replaces it with another token based on the first-order Taylor
        approximation of the loss.  This process is iteratively repeated until the prediction
        changes.  Once a token is replaced, it is not flipped again.

        # Parameters

        inputs : `JsonDict`
            The model inputs, the same as what is passed to a `Predictor`.
        input_field_to_attack : `str`, optional (default=`'tokens'`)
            The field that has the tokens that we're going to be flipping.  This must be a
            `TextField`.
        grad_input_field : `str`, optional (default=`'grad_input_1'`)
            If there is more than one field that gets embedded in your model (e.g., a question and
            a passage, or a premise and a hypothesis), this tells us the key to use to get the
            correct gradients.  This selects from the output of :func:`Predictor.get_gradients`.
        ignore_tokens : `List[str]`, optional (default=`DEFAULT_IGNORE_TOKENS`)
            These tokens will not be flipped.  The default list includes some simple punctuation,
            OOV and padding tokens, and common control tokens for BERT, etc.
        target : `JsonDict`, optional (default=`None`)
            If given, this will be a `targeted` hotflip attack, where instead of just trying to
            change a model's prediction from what it is currently predicting, we try to change it to
            a `specific` target value.  This is a `JsonDict` because it needs to specify the
            field name and target value.  For example, for a masked LM, this would be something
            like `{"words": ["she"]}`, because `"words"` is the field name, there is one mask
            token (hence the list of length one), and we want to change the prediction from
            whatever it was to `"she"`.
        """
        instance = self.predictor._json_to_instance(inputs)
        if target is None:
            output_dict = self.predictor._model.forward_on_instance(instance)
        else:
            output_dict = target

        # This now holds the predictions that we want to change (either away from or towards,
        # depending on whether `target` was passed).  We'll use this in the loop below to check for
        # when we've met our stopping criterion.
        original_instances = self.predictor.predictions_to_labeled_instances(instance, output_dict)

        # This is just for ease of access in the UI, so we know the original tokens.  It's not used
        # in the logic below.
        original_text_field: TextField = original_instances[0][  # type: ignore
            input_field_to_attack
        ]
        original_tokens = deepcopy(original_text_field.tokens)

        final_tokens = []
        final_outputs = []
        # `original_instances` is a list because there might be several different predictions that
        # we're trying to attack (e.g., all of the NER tags for an input sentence).  We attack them
        # one at a time.
        for instance in original_instances:
            tokens, outputs = self.attack_instance(
                instance=instance,
                inputs=inputs,
                input_field_to_attack=input_field_to_attack,
                grad_input_field=grad_input_field,
                ignore_tokens=ignore_tokens,
                target=target,
            )
            final_tokens.append(tokens)
            final_outputs.append(outputs)

        return sanitize(
            {"final": final_tokens, "original": original_tokens, "outputs": final_outputs}
        )
Example #23
 def _add_output(mod, _, outputs):
     if idx in results:
         prev = torch.Tensor(results[idx])
         outputs = torch.cat((prev, outputs), dim=0)
     outputs = outputs.unsqueeze(1)
     results[idx] = {"name": str(mod), "output": sanitize(outputs)}
Example #24
 def predict_instance(self, instance: Instance) -> JsonDict:
     new_instance = deepcopy(instance)
     outputs = self._model.forward_on_instance(new_instance)
     outputs['paper_id'] = instance['paper_id'].label
     return sanitize(outputs)
Example #25
 def predict_batch_tokens(self, tokens_list):
     instances = [self.predictor._dataset_reader.text_to_instance(toks, tags) for toks, tags in tokens_list]
     outputs = self.predictor._model.forward_on_instances(instances)
     return sanitize(outputs)
Example #26
 def predict_tokens(self, tokens):
     instance = self.predictor._dataset_reader.text_to_instance(tokens[0], tokens[1])
     output = self.predictor._model.forward_on_instance(instance)
     return sanitize(output)
Example #27
 def predict_raw(self, sentence):
     instance = self.predictor._json_to_instance({'sentence': sentence})
     output = self.predictor._model.forward_on_instance(instance)
     return sanitize(output)
Example #28
 def predict_instance(self, instance: Instance) -> JsonDict:
     if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
         # Handle cases where the labels are present in the test set but not training set
         self._predict_unknown(instance)
     outputs = self._model.forward_on_instance(instance)
     return sanitize(outputs)
Example #29
    # fill batch with instances
    batch = []
    while len(batch) < BATCH_SIZE:
        try:
            instance = next(it)
        except StopIteration:
            outer_loop = False
            break
        batch.append(instance)

    # get SRL predictions for batch of instances
    res = predictor._model.forward_on_instances(batch)

    # make a line for each instance
    for d in sanitize(res):
        tags = d['tags']
        words = d['words']

        # sometimes there is no B-V
        if 'B-V' not in tags:
            num_no_verb += 1
            continue

        # sometimes there is only a verb but no arguments (e.g. auxiliary word) - skip
        if not [tag for tag in tags if 'ARG' in tag]:
            num_only_verb += 1
            continue

        # make line
        verb_index = tags.index('B-V')
Example #30
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)
        outputs['texts'] = [str(token) for token in instance.fields['tokens'].tokens]
        outputs['data_id'] = [str(token) for token in instance.fields['data_id'].tokens]

        return sanitize(outputs)
Example #31
 def _add_output(mod, _, outputs):
     results[idx] = {"name": str(mod), "output": sanitize(outputs)}
Example #32
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by using the number of sentences we are given as the batch size.
        batch_size = len(inputs)
        instances_per_sentence = [
            self._sentence_to_srl_instances(json) for json in inputs
        ]

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize([
                {"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                for x in inputs
            ])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size,
                                           None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(
                    inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output['tags']
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                    "verb":
                    output["verb"],
                    "description":
                    description,
                    "tags":
                    tags,
                })
                output_index += 1

        return sanitize(return_dicts)
Example #33
 def predict_instance(self, instance: Instance) -> JsonDict:
     outputs = self._model.forward_on_instance(instance)
     outputs['answer'] = self._execute_logical_form_on_table(
         outputs['logical_form'], outputs['original_table'])
     return sanitize(outputs)
Example #34
 def predict_instance(self, instance: Instance) -> JsonDict:
     outputs = self._model.forward_on_instance(instance)
     ret_dict = eds_trans_outputs_into_mrp(self.pv_predictor,
                                           self.vocab_dict, outputs)
     return sanitize(ret_dict)
Example #35
 async def explain(inputs: Dict[str, Any]):
     with http_error_handling():
         return sanitize(pipeline.explain(**inputs))
Example #36
 def predict_batch_instance(self,
                            instances: List[Instance]) -> List[JsonDict]:
     outputs = self._model.forward_on_instances(instances)
     return sanitize(outputs)
Example #37
 def predict_instance(self, instance: Instance) -> JsonDict:
     outputs = self._model.forward_on_instance(instance)
     return sanitize(outputs)
Example #38
 def test_sanitize(self):
     assert util.sanitize(torch.Tensor([1, 2])) == [1, 2]
     assert util.sanitize(torch.LongTensor([1, 2])) == [1, 2]
Example #39
 def test_sanitize(self):
     assert util.sanitize(torch.Tensor([1, 2])) == [1, 2]
     assert util.sanitize(torch.LongTensor([1, 2])) == [1, 2]
Example #40
 def predict_instance(self, instance: Instance) -> JsonDict:
     outputs = self._model.forward_on_instance(instance)
     del outputs["logits"]
     del outputs["class_probabilities"]
     return sanitize(outputs)
Example #41
def process(token_ids):
    # Join the predicted word pieces, map the model's special [unusedN] marker
    # tokens to bracket/separator characters, and split the result into
    # individual extractions on the [SEP] token.
    temp = " ".join(token_ids)
    temp = temp.replace(" ##", "")
    temp = temp.replace("[unused1]", "( ")
    temp = temp.replace("[unused2]", " ; ")
    temp = temp.replace("[unused3]", "")
    temp = temp.replace("[unused4]", " ; ")
    temp = temp.replace("[unused5]", "")
    temp = temp.replace("[unused6]", " )")
    temp = temp.strip()
    temp = temp.split("[SEP]")
    # Drop empty segments.
    ans = [x for x in temp if x != ""]
    return ans


archive = load_archive("models/imojie",
                       weights_file="models/imojie/model_state_epoch_7.th",
                       cuda_device=-1)

predictor = Predictor.from_archive(archive, "noie_seq2seq")
inp_sent = 'I ate an apple and an orange'
inp_instance = predictor._dataset_reader.text_to_instance(inp_sent)
output = predictor._model.forward_on_instance(inp_instance)
output = sanitize(output)
output = process(output["predicted_tokens"][0])
print(output)
Example #42
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: Union[int, torch.device] = -1,
    batch_weight_key: str = None,
    output_file: str = None,
    predictions_output_file: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `Union[int, torch.device]`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    output_file : `str`, optional (default=`None`)
        Optional path to write the final metrics to.
    predictions_output_file : `str`, optional (default=`None`)
        Optional path to write the predictions to.

    # Returns

    `Dict[str, Any]`
        The final metrics.
    """
    check_for_gpu(cuda_device)
    data_loader.set_target_device(int_to_device(cuda_device))
    predictions_file = (None if predictions_output_file is None else open(
        predictions_output_file, "w"))

    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics):
                logger.warning('Metrics with names beginning with "_" will '
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (", ".join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||")
            generator_tqdm.set_description(description, refresh=False)

            if predictions_file is not None:
                predictions = json.dumps(
                    sanitize(model.make_output_human_readable(output_dict)))
                predictions_file.write(predictions + "\n")

        if predictions_file is not None:
            predictions_file.close()

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        if output_file is not None:
            dump_metrics(output_file, final_metrics, log=True)

        return final_metrics
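A brief usage sketch for the evaluation loop above, assuming a model and dataset reader have already been instantiated; the file names and the choice of `SimpleDataLoader` are assumptions, not part of the example:

from allennlp.data.data_loaders import SimpleDataLoader

# Build a loader over the validation instances and index it with the model's vocab.
instances = list(dataset_reader.read("validation.jsonl"))
data_loader = SimpleDataLoader(instances, batch_size=32, vocab=model.vocab)

metrics = evaluate(
    model,
    data_loader,
    cuda_device=-1,
    output_file="metrics.json",
    predictions_output_file="predictions.jsonl",
)
print(metrics)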
Example #43
 def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
     outputs = self._model.forward_on_instances(instances)
     return sanitize(outputs)
Example #44
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by using the number of sentences we are given as the batch size.
        batch_size = len(inputs)
        instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

        flattened_instances = [instance for sentence_instances in instances_per_sentence
                               for instance in sentence_instances]

        if not flattened_instances:
            return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                             for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [instance for instance in batched_instances[-1]
                                 if instance is not None]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output['tags']
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                        "verb": output["verb"],
                        "description": description,
                        "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)
Example #45
 def predict_batch_instance(self,
                            instances: List[Instance]) -> List[JsonDict]:
     for instance in instances:
         self._dataset_reader.apply_token_indexers(instance)
     outputs = self._model.forward_on_instances(instances)
     return sanitize(outputs)
Example #46
    def predict_batch(self,
                      sents: List[List[str]],
                      batch_size: int = 256,
                      warm_up: int = 0,
                      beam_search: int = 1) -> JsonDict:
        sents_token = [self._tag_tokens(sent) for sent in sents]

        instances, insts_st, insts_ed = [], [], []
        # find all verbs in the input sentence
        for sent_token in sents_token:
            pred_ids = [
                i for (i, t) in enumerate(sent_token) if t.pos_ == 'VERB'
            ]
            insts_st.append(len(instances))
            instances.extend([
                self._json_to_instance({
                    'sentence': sent_token,
                    'predicate_index': pid
                }) for pid in pred_ids
            ])
            insts_ed.append(len(instances))

        # warm up the model using warm_up batch (mainly because of non-determinism of ELMo)
        if warm_up:
            for b in range(0, min(warm_up * batch_size, len(instances)),
                           batch_size):
                batch_inst = instances[b:b + batch_size]
                self._model.forward_on_instances(batch_inst)

        # run model
        outputs, probs = [], []
        for b in range(0, len(instances), batch_size):
            batch_inst = instances[b:b + batch_size]
            for prediction in self._model.forward_on_instances(batch_inst):
                all_tags, all_probs = self._beam_search(
                    prediction['class_probabilities'],
                    prediction['mask'],
                    n_best=beam_search)
                outputs.append(all_tags)
                probs.append(all_probs)

        results_li = []
        for sent_token, st, ed in zip(sents_token, insts_st, insts_ed):
            # consolidate predictions
            cur_o = [e for o in outputs[st:ed] for e in o]
            cur_p = [e for o in probs[st:ed] for e in o]

            # Build and return output dictionary
            results = {
                'verbs': [],
                'words': [token.text for token in sent_token]
            }

            for tags, prob in zip(cur_o, cur_p):
                # create description text
                description = make_oie_string(sent_token, tags)
                # add a predicate prediction to the return dictionary
                results['verbs'].append({
                    'verb': get_predicate_text(sent_token, tags),
                    'description': description,
                    'tags': tags,
                    'probs': prob,
                })
            results_li.append(results)

        return sanitize(results_li)
Example #47
 def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
     outputs = self._model.forward_on_instances(instances)
     for output in outputs:
         output['answer'] = self._execute_logical_form_on_table(output['logical_form'],
                                                                output['original_table'])
     return sanitize(outputs)
Example #48
    def attack_from_json(self,
                         inputs: JsonDict,
                         input_field_to_attack: str = 'tokens',
                         grad_input_field: str = 'grad_input_1',
                         ignore_tokens: List[str] = None,
                         target: JsonDict = None) -> JsonDict:
        """
        Replaces one token at a time from the input until the model's prediction changes.
        ``input_field_to_attack`` is for example ``tokens``, it says what the input field is
        called.  ``grad_input_field`` is for example ``grad_input_1``, which is a key into a grads
        dictionary.

        The method computes the gradient w.r.t. the tokens, finds the token with the maximum
        gradient (by L2 norm), and replaces it with another token based on the first-order Taylor
        approximation of the loss.  This process is iteratively repeated until the prediction
        changes.  Once a token is replaced, it is not flipped again.

        Parameters
        ----------
        inputs : ``JsonDict``
            The model inputs, the same as what is passed to a ``Predictor``.
        input_field_to_attack : ``str``, optional (default='tokens')
            The field that has the tokens that we're going to be flipping.  This must be a
            ``TextField``.
        grad_input_field : ``str``, optional (default='grad_input_1')
            If there is more than one field that gets embedded in your model (e.g., a question and
            a passage, or a premise and a hypothesis), this tells us the key to use to get the
            correct gradients.  This selects from the output of :func:`Predictor.get_gradients`.
        ignore_tokens : ``List[str]``, optional (default=DEFAULT_IGNORE_TOKENS)
            These tokens will not be flipped.  The default list includes some simple punctuation,
            OOV and padding tokens, and common control tokens for BERT, etc.
        target : ``JsonDict``, optional (default=None)
            If given, this will be a `targeted` hotflip attack, where instead of just trying to
            change a model's prediction from what it is currently predicting, we try to change it to
            a `specific` target value.  This is a ``JsonDict`` because it needs to specify the
            field name and target value.  For example, for a masked LM, this would be something
            like ``{"words": ["she"]}``, because ``"words"`` is the field name, there is one mask
            token (hence the list of length one), and we want to change the prediction from
            whatever it was to ``"she"``.
        """
        if self.token_embedding is None:
            self.initialize()
        ignore_tokens = DEFAULT_IGNORE_TOKENS if ignore_tokens is None else ignore_tokens

        # If `target` is `None`, we move away from the current prediction, otherwise we move
        # _towards_ the target.
        sign = -1 if target is None else 1
        instance = self.predictor._json_to_instance(inputs)
        if target is None:
            output_dict = self.predictor._model.forward_on_instance(instance)
        else:
            output_dict = target

        # This now holds the predictions that we want to change (either away from or towards,
        # depending on whether `target` was passed).  We'll use this in the loop below to check for
        # when we've met our stopping criterion.
        original_instances = self.predictor.predictions_to_labeled_instances(
            instance, output_dict)

        # This is just for ease of access in the UI, so we know the original tokens.  It's not used
        # in the logic below.
        original_text_field: TextField = original_instances[0][
            input_field_to_attack]  # type: ignore
        original_tokens = deepcopy(original_text_field.tokens)

        final_tokens = []
        # `original_instances` is a list because there might be several different predictions that
        # we're trying to attack (e.g., all of the NER tags for an input sentence).  We attack them
        # one at a time.
        for instance in original_instances:
            # Gets a list of the fields that we want to check to see if they change.
            fields_to_compare = utils.get_fields_to_compare(
                inputs, instance, input_field_to_attack)

            # We'll be modifying the tokens in this text field below, and grabbing the modified
            # list after the `while` loop.
            text_field: TextField = instance[
                input_field_to_attack]  # type: ignore

            # Because we can save computation by getting grads and outputs at the same time, we do
            # them together at the end of the loop, even though we use grads at the beginning and
            # outputs at the end.  This is our initial gradient for the beginning of the loop.  The
            # output can be ignored here.
            grads, outputs = self.predictor.get_gradients([instance])

            # Ignore any token that is in the ignore_tokens list by setting the token to already
            # flipped.
            flipped: List[int] = []
            for index, token in enumerate(text_field.tokens):
                if token.text in ignore_tokens:
                    flipped.append(index)
            if 'clusters' in outputs:
                # Coref unfortunately needs a special case here.  We don't want to flip words in
                # the same predicted coref cluster, but we can't really specify a list of tokens,
                # because, e.g., "he" could show up in several different clusters.
                # TODO(mattg): perhaps there's a way to get `predictions_to_labeled_instances` to
                # return the set of tokens that shouldn't be changed for each instance?  E.g., you
                # could imagine setting a field on the `Token` object, that we could then read
                # here...
                for cluster in outputs['clusters']:
                    for mention in cluster:
                        for index in range(mention[0], mention[1] + 1):
                            flipped.append(index)

            while True:
                # Compute L2 norm of all grads.
                grad = grads[grad_input_field]
                grads_magnitude = [g.dot(g) for g in grad]

                # only flip a token once
                for index in flipped:
                    grads_magnitude[index] = -1

                # We flip the token with highest gradient norm.
                index_of_token_to_flip = numpy.argmax(grads_magnitude)
                if grads_magnitude[index_of_token_to_flip] == -1:
                    # If we've already flipped all of the tokens, we give up.
                    break
                flipped.append(index_of_token_to_flip)

                # TODO(mattg): This is quite a bit of a hack, both for gpt2 and for getting the
                # vocab id in general...  I don't have better ideas at the moment, though.
                indexer_name = 'tokens' if self.namespace == 'gpt2' else self.namespace
                input_tokens = text_field._indexed_tokens[indexer_name]
                original_id_of_token_to_flip = input_tokens[
                    index_of_token_to_flip]

                # Get new token using taylor approximation.
                new_id = self._first_order_taylor(
                    grad[index_of_token_to_flip],
                    self.token_embedding.weight,  # type: ignore
                    original_id_of_token_to_flip,
                    sign)

                # Flip token.  We need to tell the instance to re-index itself, so the text field
                # will actually update.
                new_token = Token(self.vocab._index_to_token[self.namespace]
                                  [new_id])  # type: ignore
                text_field.tokens[index_of_token_to_flip] = new_token
                instance.indexed = False

                # Get model predictions on instance, and then label the instances
                grads, outputs = self.predictor.get_gradients(
                    [instance])  # predictions
                for key, output in outputs.items():
                    if isinstance(output, torch.Tensor):
                        outputs[key] = output.detach().cpu().numpy().squeeze()
                    elif isinstance(output, list):
                        outputs[key] = output[0]

                # TODO(mattg): taking the first result here seems brittle, if we're in a case where
                # there are multiple predictions.
                labeled_instance = self.predictor.predictions_to_labeled_instances(
                    instance, outputs)[0]

                # If we've met our stopping criterion, we stop.
                has_changed = utils.instance_has_changed(
                    labeled_instance, fields_to_compare)
                if target is None and has_changed:
                    # With no target, we just want to change the prediction.
                    break
                if target is not None and not has_changed:
                    # With a given target, we want to *match* the target, which we check by
                    # `not has_changed`.
                    break

            final_tokens.append(text_field.tokens)

        return sanitize({
            "final": final_tokens,
            "original": original_tokens,
            "outputs": outputs
        })
Example #49
 def predict_instance(self, instance: Instance) -> JsonDict:
     outputs = self._model.forward_on_instance(instance)
     outputs['answer'] = self._execute_logical_form_on_table(outputs['logical_form'],
                                                             outputs['original_table'])
     return sanitize(outputs)