Example #1
    def predict(self, inputs: JsonDict):
        result = None

        image_url = inputs.get("image_url")
        if image_url is not None:
            result = super().predict({
                "question": inputs["question"],
                "image": image_url
            })
        else:
            image = inputs.get("image")
            if image is not None:
                image_base64 = image["image_base64"]
                if image_base64 is not None:
                    with tempfile.NamedTemporaryFile(
                            prefix=f"{self.__class__.__name__}-") as f:
                        f.write(standard_b64decode(image_base64))
                        f.flush()
                        result = super().predict({
                            "question": inputs["question"],
                            "image": f.name
                        })

        if result is None:
            raise ValueError("No image found in request.")

        results = [{"answer": token, "confidence": score * 100}
                   for token, score in result["tokens"].items()
                   if not token.startswith("@@")]
        results.sort(key=lambda x: -x["confidence"])
        return results[:45]  # Jon only wants the first 45 results.
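A minimal sketch of the two request shapes this predict method accepts, using the field names the snippet reads ("question", "image_url", and "image"/"image_base64"); the URL and file name are illustrative:

    import base64

    # Variant 1: pass the image by URL.
    url_request = {
        "question": "What color is the car?",
        "image_url": "https://example.com/car.jpg",
    }

    # Variant 2: pass the image inline as base64 (decoded into a temp file above).
    with open("car.jpg", "rb") as fp:
        b64_request = {
            "question": "What color is the car?",
            "image": {"image_base64": base64.standard_b64encode(fp.read()).decode()},
        }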
Example #2
 def process_output(self, output: JsonDict) -> JsonDict:
     pred_sent_orders = output.get('pred_sent_orders', None)
     num_sents = len(output['sent_labels']) # for removing padding
      if pred_sent_orders is not None:
         pred_chains = [order2chain(order) for order in pred_sent_orders]
         pred_chains = [ch for ch in pred_chains if all(c < num_sents for c in ch)]
         assert len(pred_chains) > 0, repr([order2chain(order) for order in pred_sent_orders]) + '\n' + 'num sents: %d' % num_sents + '\n%s' % output['_id']
     else:
          # get pred evidences from sentences with top k ``gate_prob``
         gate_probs = output['gate_probs'][:num_sents]
         pred_chains = [[i] for i in sorted(range(num_sents), key=lambda x: gate_probs[x], reverse=True)[:10]]
     return {#'answer_texts': output['answer_texts'],
             #'best_span_str': output.get('best_span_str', None),
             #'best_span': output.get('best_span', None),
             'pred_sent_labels': output.get('pred_sent_labels', None),
             'pred_sent_orders': output.get('pred_sent_orders', None),
             'pred_chains': pred_chains,
             'possible_chain': output.get('evd_possible_chains', None),
             'question_tokens': output['question_tokens'],
             'passage_sent_tokens': output['passage_sent_tokens'],
             #'token_spans_sp': output['token_spans_sp'],
             #'token_spans_sent': output['token_spans_sent'],
             'sent_labels': output['sent_labels'],
             'ans_sent_idxs': output.get('ans_sent_idxs', None),
             '_id': output['_id']}
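order2chain is not defined in the snippet. One plausible reading, assumed here purely for illustration, is that each order assigns a chain position to every sentence and the helper returns the sentence indices sorted by that position:

    # Hypothetical helper: order[i] is sentence i's position in the
    # evidence chain, with a negative value meaning "not in the chain".
    def order2chain(order):
        positions = [(pos, idx) for idx, pos in enumerate(order) if pos >= 0]
        return [idx for _, idx in sorted(positions)]

    assert order2chain([1, -1, 0]) == [2, 0]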
Example #3
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     """
     Expects JSON that looks like ``{"tokens": "[...]", "upos_tags": "[...]"}``.
     """
     tokens = json_dict["tokens"]
     gold_upos_tags = json_dict.get("upos_tags", None)
     gold_lemmas = json_dict.get("lemmas", None)
     return self._dataset_reader.text_to_instance(tokens=tokens,
                                                  upos_tags=gold_upos_tags,
                                                  lemmas=gold_lemmas)
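A request matching the docstring might look like this (values illustrative):

    {"tokens": ["The", "cats", "slept"],
     "upos_tags": ["DET", "NOUN", "VERB"],
     "lemmas": ["the", "cat", "sleep"]}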
Example #4
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:

        text = json_dict["text"]
        text = " " + text.strip()
        sentiment = json_dict["sentiment"]
        text_id = json_dict.get("TextID")
        if text_id is None:
            text_id = "<No-Text-id>"
        return self._dataset_reader.text_to_instance(
            text, sentiment, text_id, json_dict.get("selected_text")
        )
Example #5
 def _json_to_instance(
         self,  # type: ignore
         json_dict: JsonDict) -> Instance:
     premise_text = json_dict.get("sentence1", None) or json_dict.get(
         "premise", None)
     hypothesis_text = json_dict.get("sentence2", None) or json_dict.get(
         "hypothesis", None)
     if premise_text and hypothesis_text:
         return self._dataset_reader.text_to_instance(
             premise_text, hypothesis_text)
      logger.error("Error parsing input: missing premise or hypothesis")
     return None
Example #6
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     """
     Expects JSON that looks like ``{"question": "...", "fact": "..."}``.
     """
     if isinstance(json_dict["question"], dict):
         question_stem = json_dict["question"]["stem"]
         choices = [x["text"] for x in json_dict["question"]["choices"]]
     else:
         question_text = json_dict["question"]
         question_stem, choices = decompose_question(question_text)
     fact = json_dict.get("fact") or json_dict.get("fact1")
     span = json_dict.get("span") or json_dict.get("answer_spans")[0]
     spans = [span]
     if "relation" in json_dict:
         relations = [json_dict["relation"]]
     else:
         relations = None
     if "offset" in json_dict:
         offset = json_dict["offset"]
     elif "answer_starts" in json_dict:
         offset = json_dict["answer_starts"][0]
     else:
          offset = fact.find(span)  # str.find returns -1 when absent, matching the check below
     if offset == -1:
         raise ValueError("Span: {} not found in fact: {}".format(
             span, fact))
     offsets = [offset]  #[(offset, offset + len(span))]
     if "id" in json_dict:
         qid = json_dict["id"]
     else:
          qid = random.randint(0, 100)  # randint requires two bounds; random fallback id
     prefetched_sentences = json_dict.get("prefetched_sentences", None)
     prefetched_indices = json_dict.get("prefetched_indices", None)
     if prefetched_sentences is not None:
         return self._dataset_reader.text_to_instance(
             qid,
             question_stem,
             choices,
             fact,
             spans,
             relations,
             answer_starts=offsets,
             prefetched_sentences=prefetched_sentences,
             prefetched_indices=prefetched_indices)
     else:
         return self._dataset_reader.text_to_instance(qid,
                                                      question_stem,
                                                      choices,
                                                      fact,
                                                      spans,
                                                      relations,
                                                      answer_starts=offsets)
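An input in the structured shape this method accepts might look like the following; "fact1" and "answer_spans"/"answer_starts" are the fallback field names the code also reads, and the values are illustrative:

    {"id": "q1",
     "question": {"stem": "What conducts electricity?",
                  "choices": [{"text": "copper"}, {"text": "wood"}]},
     "fact": "Copper conducts electricity.",
     "span": "Copper",
     "offset": 0,
     "relation": "conducts"}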
Example #7
 def _json_to_instance(self,  # type: ignore
                       json_dict: JsonDict) -> Instance:
     premises = json_dict["premises"]
     hypotheses = json_dict["hypotheses"]
     entailments = json_dict.get("entailments", None)
     if entailments is None:
         answer_indices = None
     else:
         answer_indices = [index for index, entailment in enumerate(entailments) if entailment]
     relevant_sentence_idxs = json_dict.get("relevant_sentence_idxs", None)
     return self._dataset_reader.text_to_instance(premises,
                                                  hypotheses,
                                                  answer_indices,
                                                  relevant_sentence_idxs)
Example #8
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     return self._dataset_reader.text_to_instance(
         words=json_dict['words'],
         ud_head_indices=json_dict['heads'],
         ud_tags=json_dict['tags'],
         ud_labels=json_dict['head_labels'],
         metadata=json_dict.get('metadata', None))
Example #9
 def _compatible_question(self, question_data: JsonDict) -> bool:
     question_id = question_data.get("id")
     if not question_id:
         return True
     if "_friction" not in self._lf_syntax:
         return True
     return "_Fr_" in question_id or "Friction" in question_id
Example #10
 def _my_json_to_instance(self,
                          json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
     # Make a cast here to satisfy mypy
     dataset_reader = cast(BertMCQAReader, self._dataset_reader)
     qid = json_dict['id']
     question_data = json_dict['question']
     question_text = question_data['stem']
     choice_text_list = [
         choice['text'] for choice in question_data['choices']
     ]
     choice_labels = [
         choice['label'] for choice in question_data['choices']
     ]
     choice_context_list = []
     context = json_dict.get("para", None)
     for choice in question_data['choices']:
         choice_context_list.append(choice.get("para", None))
     instance = dataset_reader.text_to_instance(
         qid,
         question_text,
         choice_text_list,
         context=context,
         choice_context_list=choice_context_list)
     extra_info = {'id': qid, 'choice_labels': choice_labels}
     return instance, extra_info
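A request in the shape this method expects might look like this (values illustrative); the optional "para" keys supply a shared context and per-choice contexts:

    {"id": "q1",
     "para": "Metals conduct electricity.",
     "question": {"stem": "Which conducts electricity?",
                  "choices": [{"label": "A", "text": "copper",
                               "para": "Copper is a metal."},
                              {"label": "B", "text": "wood"}]}}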
Example #11
    def _json_to_instance(self,
                          json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        premise = json_dict.get('premise', None)
        hypothesis = json_dict.get('hypothesis', None)
        premise_entities = json_dict.get('premise_entities', None)
        hypothesis_entities = json_dict.get('hypothesis_entities', None)
        instance = self._dataset_reader.text_to_instance(
            premise=premise,
            hypothesis=hypothesis,
            premise_entities=premise_entities,
            hypothesis_entities=hypothesis_entities)

        label_dict = self._model.vocab.get_index_to_token_vocabulary('labels')
        all_labels = [label_dict[i] for i in range(len(label_dict))]

        return instance, {"all_labels": all_labels}
Example #12
 def _get_entity_tags(self, entities: List[str],
                      table_field: KnowledgeGraphField,
                      entity_literals: JsonDict,
                      tokenized_question: List[Token]) -> List[int]:
     res = []
     # Hackily access last two feature extractors for table field (span overlaps which don't
     # depend on the actual table information)
     features = table_field._feature_extractors[8:]  # pylint: disable=protected-access
     for i, token in enumerate(tokenized_question):
         tag_best = 0
         score_max = 0.0
         for tag_index, tag in enumerate(entities):
             literals = entity_literals.get(tag, [])
             if not isinstance(literals, list):
                 literals = [literals]
             for literal in literals:
                 tag_tokens = self._tokenizer.tokenize(literal.lower())
                 scores = [
                     fe(tag, tag_tokens, token, i, tokenized_question)
                     for fe in features
                 ]
                 # Small tie breaker in favor of longer sequences
                 score = max(scores) + len(tag_tokens) / 100
                 if score > score_max and score >= 0.5:
                     tag_best = tag_index + 1
                     score_max = score
         res.append(tag_best)
     return res
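For intuition on the tie-breaker: if a one-token literal and a two-token literal both reach a best feature score of 0.5, they score 0.51 and 0.52 respectively, so the longer literal wins; any candidate whose adjusted score stays below the 0.5 threshold leaves the token tagged 0 (no entity).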
Example #13
 def _compatible_question(self, question_data: JsonDict) -> bool:
     question_id = question_data.get('id')
     if not question_id:
         return True
      if '_friction' not in self._lf_syntax:
         return True
     return '_Fr_' in question_id or 'Friction' in question_id
Example #15
 def _get_entity_tags(self,
                      entities: List[str],
                      table_field: KnowledgeGraphField,
                      entity_literals: JsonDict,
                      tokenized_question: List[Token]) -> List[int]:
     res = []
     # Hackily access last two feature extractors for table field (span overlaps which don't
     # depend on the actual table information)
     features = table_field._feature_extractors[8:]  # pylint: disable=protected-access
     for i, token in enumerate(tokenized_question):
         tag_best = 0
         score_max = 0.0
         for tag_index, tag in enumerate(entities):
             literals = entity_literals.get(tag, [])
             if not isinstance(literals, list):
                 literals = [literals]
             for literal in literals:
                 tag_tokens = self._tokenizer.tokenize(literal.lower())
                 scores = [fe(tag, tag_tokens, token, i, tokenized_question) for fe in features]
                 # Small tie breaker in favor of longer sequences
                 score = max(scores) + len(tag_tokens)/100
                 if score > score_max and score >= 0.5:
                     tag_best = tag_index + 1
                     score_max = score
         res.append(tag_best)
     return res
Example #16
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        We need to override this because of the interactive beam search aspects.
        """

        instance = self._json_to_instance(inputs)

        # Get the rules out of the instance
        index_to_rule = [
            production_rule_field.rule
            for production_rule_field in instance.fields["actions"].field_list
        ]
        rule_to_index = {rule: i for i, rule in enumerate(index_to_rule)}

        # A sequence of strings to force, then convert them to ints
        initial_tokens = inputs.get("initial_sequence", [])

        # Want to get initial_sequence on the same device as the model.
        initial_sequence = torch.tensor(
            [rule_to_index[token] for token in initial_tokens],
            device=next(self._model.parameters()).device,
        )

        # Replace beam search with one that forces the initial sequence
        original_beam_search = self._model._beam_search
        interactive_beam_search = original_beam_search.constrained_to(
            initial_sequence)
        self._model._beam_search = interactive_beam_search

        # Now get results
        results = self.predict_instance(instance)

        # And add in the choices. Need to convert from idxs to rules.
        results["choices"] = [[
            (probability, action) for probability, action in zip(
                pa["action_probabilities"], pa["considered_actions"])
        ] for pa in results["predicted_actions"]]

        results["beam_snapshots"] = {
            # For each batch_index, we get a list of beam snapshots
            batch_index: [
                # Each beam_snapshots consists of a list of timesteps,
                # each of which is a list of pairs (score, sequence).
                # The sequence is the *indices* of the rules, which we
                # want to convert to the string representations.
                [(score, [index_to_rule[idx] for idx in sequence])
                 for score, sequence in timestep_snapshot]
                for timestep_snapshot in beam_snapshots
            ]
            for batch_index, beam_snapshots in
            interactive_beam_search.beam_snapshots.items()
        }

        # Restore original beam search
        self._model._beam_search = original_beam_search

        return results
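A call forcing the first productions of the beam search might look like this; the rule strings are hypothetical and must match entries in the instance's "actions" field:

    result = predictor.predict_json({
        "question": "Which surface has the most friction?",
        "initial_sequence": ["@start@ -> s", "s -> [t, u]"],  # hypothetical rules
    })
    # result["choices"] and result["beam_snapshots"] describe the
    # constrained search, with rule indices mapped back to strings.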
Example #17
def convert_qajson_to_entailment(qa_json: JsonDict):
    question_text = qa_json["question"]["stem"]
    choices = qa_json["question"]["choices"]
    for choice in choices:
        choice_text = choice["text"]

        statement = create_hypothesis(get_fitb_from_question(question_text), choice_text)
        create_output_dict(qa_json, statement, choice["label"] == qa_json.get("answerKey", "Z"))

    return qa_json
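A usage sketch, assuming create_hypothesis, get_fitb_from_question and create_output_dict come from the same module and that create_output_dict mutates qa_json in place; the question content is illustrative:

    qa_json = {
        "answerKey": "A",
        "question": {"stem": "Metal is a good conductor of what?",
                     "choices": [{"label": "A", "text": "electricity"},
                                 {"label": "B", "text": "sound"}]},
    }
    entailment_json = convert_qajson_to_entailment(qa_json)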
Example #18
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"question": "...", "passage": "..."}``.
        """
        question_text = json_dict["question"]
        background = json_dict["background"]
        situation = json_dict.get("situation")

        return self._dataset_reader.text_to_instance(question_text, background,
                                                     situation)
Exemplo n.º 19
0
    def attack(self, attacker_id: str, attack: JsonDict) -> JsonDict:
        """
        Modifies the input (e.g. by adding or removing tokens) to try to change the model's prediction
        in some desired manner.
        """
        if attacker_id not in config.VALID_ATTACKERS:
            raise http.UnknownAttackerError(attacker_id)
        attacker = self.attackers.get(attacker_id)
        if attacker is None:
            raise http.InvalidAttackerError(attacker_id)

        print('attack', attack)

        inputs = attack['inputs']
        input_field_to_attack = attack.get('input_field_to_attack', 'tokens')
        grad_input_field = attack.get('grad_input_field', 'grad_input_1')
        ignore_tokens = attack.get('ignore_tokens', None)
        target = attack.get('target', None)

        if target is not None:
            raise ValueError(
                "Input reduction does not implement targeted attacks")
        ignore_tokens = ["@@NULL@@"
                         ] if ignore_tokens is None else ignore_tokens

        original_instances = self.predictor.labeled_json_to_labeled_instances(
            inputs)

        final_tokens = {}
        original_tokens = {}
        for idx, instance in sorted(original_instances.items()):
            final_tokens[idx] = (attacker._attack_instance(
                inputs, instance, input_field_to_attack, grad_input_field,
                ignore_tokens))
            original_tokens[idx] = deepcopy(
                instance[input_field_to_attack].tokens)
        return sanitize({"final": final_tokens, "original": original_tokens})
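An attack payload in the shape this endpoint reads might look like the following; the attacker id and inputs are illustrative:

    payload = {
        "inputs": {"sentence": "a very thrilling movie"},
        "input_field_to_attack": "tokens",
        "grad_input_field": "grad_input_1",
        "ignore_tokens": ["@@NULL@@"],
    }
    # result = demo.attack("input_reduction", payload)  # hypothetical attacker id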
Example #20
    def dump_line(self, outputs: JsonDict) -> str:

        if not self.numeric:
            prediction = outputs["label"]
        else:
            prediction = outputs["prediction"]
            if isinstance(prediction, float):
                prediction = min(max(prediction, 0), 5)
                prediction = f"{prediction:.3f}"

        output = {
            "idx": int(outputs["index"]),
            # "label": prediction,
            "pseudolabel": outputs["logits"],
            **outputs.get("raw_input", {})
        }

        return json.dumps(output, ensure_ascii=False) + "\n"
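For a numeric task, an outputs dict and the resulting line might be (values illustrative):

    outputs = {"index": 3, "prediction": 5.73, "logits": [0.1, 0.9],
               "raw_input": {"sentence": "great movie"}}
    # The prediction is clamped to [0, 5] and formatted as "5.000", but with
    # the "label" key commented out only the pseudolabel and raw input survive:
    # {"idx": 3, "pseudolabel": [0.1, 0.9], "sentence": "great movie"}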
Example #21
    def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
        # read one json instance from prostruct
        # sentence_texts: List[str]
        # participants: List[str],
        # states: List[List[str]], where states[i][j] is ith participant at time j

        # Para id is useful for decoder trainer. As we won't call it at prediction time,
        # we make this optional.
        para_id = inputs.get("para_id", -1)
        sentence_texts = inputs["sentence_texts"]
        sentence_texts = sentence_texts if "\n" not in sentence_texts else [s for s in sentence_texts.split("\n")]
        participants = inputs["participants"]
        if not participants:
            participants = [p for p in self.helper.participants_from_sentences(sentence_texts)]
        # Participants can be separated in many different ways
        # (A participant can contain comma and in those cases we separate by "\n" or "\t").
        # Do this only when participants is not already a list (demo passes a string).
        if isinstance(participants, str):
            if "\n" in participants:
                separator = "\n"
            elif "\t" in participants:
                separator = "\t"
            else:
                separator = ","
            participants = [p.strip() for p in participants.split(separator)]
        states = inputs.get("states", None)
        # Can be used in demo (eventually the demo would control more parameters such as which commonsense etc).
        top_k_sequences = inputs.get("top_k_sequences", 2)
        print(f"Predictor gets input: ", inputs)
        print(f"Predictor formats inputs =\n{para_id},\n{sentence_texts}\n{participants}")

        instance = self._dataset_reader.text_to_instance(para_id=para_id,
                                                         sentence_texts=sentence_texts,
                                                         participants=list(participants),
                                                         states=states,
                                                         filename="test"
                                                         # rules_activated="0,0,0,0"
                                                         )  # convert from set

        # Can we update instance based on self.proparaDecoderStep.update_rules()
        old_action_scorer = self._model.decoder_step.get_action_scorer()
        old_valid_action_gen = self._model.decoder_step.get_valid_action_generator()

        rules_used_original = self._model.decoder_step.get_valid_action_generator().get_rules_used()

        dont_use_kb = "dont_use_kb" in inputs and inputs["dont_use_kb"]
        if dont_use_kb:
            self._model.decoder_step.change_action_scorer(ActionScorerDummy())

        rules_changed = "rules_used" in inputs and inputs["rules_used"] is not None
        if rules_changed:
            updated_rules = [int(rule_val.strip()) > 0
                             for rule_val in inputs["rules_used"].split(",")]
            self._model.decoder_step.get_valid_action_generator().set_rules_used(updated_rules)

        outputs = self._model.forward_on_instance(instance)

        # Reset to original settings.
        if dont_use_kb:
            self._model.decoder_step.change_action_scorer(old_action_scorer)
        if rules_changed:
            self._model.decoder_step.change_valid_action_generator(old_valid_action_gen)

        json_outputs = ProStructPredictor.to_json(
            outputs,
            participants,
            top_k_sequences
        )
        json_outputs["default_kb_used"] = self._model.decoder_step.get_action_scorer().name
        json_outputs["default_rules_used"] = rules_used_original
        json_outputs['predicted_locations'] = self.predict_locations(outputs, sentence_texts, participants)

        settings_used = ""
        if rules_changed or dont_use_kb:
            settings_used = f"rules used: {inputs.get('rules_used', '')} and using {'no kb' if dont_use_kb else 'kb'}"
        json_outputs['settings_used'] = settings_used

        json_outputs["sentences"] = sentence_texts
        return {**inputs, **json_outputs}
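A minimal request for this predictor could look like this, using the field names the snippet reads; values are illustrative, and participants is given in the newline-separated string form the demo passes:

    {"para_id": 42,
     "sentence_texts": ["Water evaporates.", "Clouds form."],
     "participants": "water\nclouds",
     "top_k_sequences": 2,
     "dont_use_kb": false,
     "rules_used": "1,1,0,0"}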
Example #22
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     return self._dataset_reader.text_to_instance(
         json_dict["sentence1"], json_dict["sentence2"],
         json_dict.get("gold_label"))
Example #23
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     text = json_dict['text']
     entity = json_dict.get('entity')
     metadata = json_dict.get('metadata')
     return self._dataset_reader.text_to_instance(text, entity, metadata)
Example #24
 def _json_to_instance(self,
                       json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
     index = json_dict.get('index', None)
     article = json_dict['article']
     instance = self._dataset_reader.text_to_instance(source_string=article)
     return instance, {'index': index} if index is not None else {}
Example #25
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     tokens = json_dict['sentence']
     sentiment = json_dict.get('sentiment')
     instance = self._dataset_reader.text_to_instance(tokens=tokens, sentiment=sentiment)
     return instance