def predict(self, inputs: JsonDict):
    """
    Answer a question about an image given either by URL or inline as base64.

    ``inputs`` is read for ``"image_url"`` first; when that is absent, for
    ``"image" -> "image_base64"``.  ``"question"`` is only read once an image
    has been found.  The parent ``predict`` is called with the question and
    either the URL or the path of a temporary file holding the decoded bytes.

    Returns at most 45 ``{"answer": ..., "confidence": ...}`` dicts, sorted by
    descending confidence (the parent's score multiplied by 100).  Tokens that
    start with ``"@@"`` are left out.

    Raises ``ValueError`` when neither form of image is present.
    """
    prediction = None
    url = inputs.get("image_url")
    if url is not None:
        prediction = super().predict({
            "question": inputs["question"],
            "image": url
        })
    else:
        inline_image = inputs.get("image")
        encoded = None if inline_image is None else inline_image["image_base64"]
        if encoded is not None:
            # The temp file only exists inside this ``with`` block, so the
            # parent prediction has to happen before leaving it.
            with tempfile.NamedTemporaryFile(
                    prefix=f"{self.__class__.__name__}-") as handle:
                handle.write(standard_b64decode(encoded))
                handle.flush()
                prediction = super().predict({
                    "question": inputs["question"],
                    "image": handle.name
                })

    if prediction is None:
        raise ValueError("No image found in request.")

    answers = []
    for token, score in prediction["tokens"].items():
        if token.startswith("@@"):
            continue
        answers.append({"answer": token, "confidence": score * 100})

    ranked = sorted(answers, key=lambda entry: -entry["confidence"])
    return ranked[:45]  # Jon only wants the first 45 results.
def process_output(self, output: JsonDict) -> JsonDict:
    """
    Reduce a raw model ``output`` dict to the fields kept for evaluation.

    ``pred_chains`` is derived in one of two ways:

    * when ``pred_sent_orders`` is present, each order is converted with
      ``order2chain`` and chains that reference a sentence index beyond the
      real sentence count (i.e. padding) are discarded; at least one chain
      must survive, otherwise an ``AssertionError`` is raised;
    * otherwise, single-sentence chains are built from the (at most) 10
      sentences with the highest ``gate_probs``.

    ``answer_texts``, ``best_span_str``, ``best_span``, ``token_spans_sp`` and
    ``token_spans_sent`` are currently not forwarded.
    """
    orders = output.get('pred_sent_orders', None)
    sentence_count = len(output['sent_labels'])  # for removing padding

    if orders is None:
        # get pred evidences from sentences with top k ``gate_prob``
        gate_probs = output['gate_probs'][:sentence_count]
        ranked = sorted(range(sentence_count),
                        key=lambda idx: gate_probs[idx],
                        reverse=True)
        pred_chains = [[idx] for idx in ranked[:10]]
    else:
        all_chains = [order2chain(order) for order in orders]
        pred_chains = []
        for chain in all_chains:
            if all(c < sentence_count for c in chain):
                pred_chains.append(chain)
        assert len(pred_chains) > 0, repr(all_chains) + '\n' + 'num sents: %d' % sentence_count + '\n%s' % output['_id']

    return {
        'pred_sent_labels': output.get('pred_sent_labels', None),
        'pred_sent_orders': orders,
        'pred_chains': pred_chains,
        'possible_chain': output.get('evd_possible_chains', None),
        'question_tokens': output['question_tokens'],
        'passage_sent_tokens': output['passage_sent_tokens'],
        'sent_labels': output['sent_labels'],
        'ans_sent_idxs': output.get('ans_sent_idxs', None),
        '_id': output['_id'],
    }
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"tokens": "[...]", "upos_tags": "[...]"}``. """ tokens = json_dict["tokens"] gold_upos_tags = json_dict.get("upos_tags", None) gold_lemmas = json_dict.get("lemmas", None) return self._dataset_reader.text_to_instance(tokens=tokens, upos_tags=gold_upos_tags, lemmas=gold_lemmas)
def _json_to_instance(self, json_dict: JsonDict) -> Instance: text = json_dict["text"] text = " " + text.strip() sentiment = json_dict["sentiment"] text_id = json_dict.get("TextID") if text_id is None: text_id = "<No-Text-id>" return self._dataset_reader.text_to_instance( text, sentiment, text_id, json_dict.get("selected_text") )
def _json_to_instance( self, # type: ignore json_dict: JsonDict) -> Instance: premise_text = json_dict.get("sentence1", None) or json_dict.get( "premise", None) hypothesis_text = json_dict.get("sentence2", None) or json_dict.get( "hypothesis", None) if premise_text and hypothesis_text: return self._dataset_reader.text_to_instance( premise_text, hypothesis_text) logger.info("Error parsing input") return None
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"question": "...", "fact": "..."}``. """ if isinstance(json_dict["question"], dict): question_stem = json_dict["question"]["stem"] choices = [x["text"] for x in json_dict["question"]["choices"]] else: question_text = json_dict["question"] question_stem, choices = decompose_question(question_text) fact = json_dict.get("fact") or json_dict.get("fact1") span = json_dict.get("span") or json_dict.get("answer_spans")[0] spans = [span] if "relation" in json_dict: relations = [json_dict["relation"]] else: relations = None if "offset" in json_dict: offset = json_dict["offset"] elif "answer_starts" in json_dict: offset = json_dict["answer_starts"][0] else: offset = fact.index(span) if offset == -1: raise ValueError("Span: {} not found in fact: {}".format( span, fact)) offsets = [offset] #[(offset, offset + len(span))] if "id" in json_dict: qid = json_dict["id"] else: qid = random.randint(100) prefetched_sentences = json_dict.get("prefetched_sentences", None) prefetched_indices = json_dict.get("prefetched_indices", None) if prefetched_sentences is not None: return self._dataset_reader.text_to_instance( qid, question_stem, choices, fact, spans, relations, answer_starts=offsets, prefetched_sentences=prefetched_sentences, prefetched_indices=prefetched_indices) else: return self._dataset_reader.text_to_instance(qid, question_stem, choices, fact, spans, relations, answer_starts=offsets)
def _json_to_instance(self, # type: ignore json_dict: JsonDict) -> Instance: premises = json_dict["premises"] hypotheses = json_dict["hypotheses"] entailments = json_dict.get("entailments", None) if entailments is None: answer_indices = None else: answer_indices = [index for index, entailment in enumerate(entailments) if entailment] relevant_sentence_idxs = json_dict.get("relevant_sentence_idxs", None) return self._dataset_reader.text_to_instance(premises, hypotheses, answer_indices, relevant_sentence_idxs)
def _json_to_instance(self, json_dict: JsonDict) -> Instance: return self._dataset_reader.text_to_instance( words=json_dict['words'], ud_head_indices=json_dict['heads'], ud_tags=json_dict['tags'], ud_labels=json_dict['head_labels'], metadata=json_dict.get('metadata', None))
def _compatible_question(self, question_data: JsonDict) -> bool: question_id = question_data.get("id") if not question_id: return True if "_friction" not in self._lf_syntax: return True return "_Fr_" in question_id or "Friction" in question_id
def _my_json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
    """
    Build an ``Instance`` for a multiple-choice question plus bookkeeping info.

    ``json_dict`` must contain ``"id"`` and ``"question"`` (with ``"stem"``
    and ``"choices"``, each choice having ``"text"`` and ``"label"``).  An
    optional top-level ``"para"`` is used as the shared context and each
    choice's optional ``"para"`` as its own context.

    Returns the instance together with ``{"id": ..., "choice_labels": ...}``.
    """
    # Make a cast here to satisfy mypy
    reader = cast(BertMCQAReader, self._dataset_reader)

    question_id = json_dict['id']
    question = json_dict['question']
    stem = question['stem']

    texts = [option['text'] for option in question['choices']]
    labels = [option['label'] for option in question['choices']]

    shared_context = json_dict.get("para", None)
    per_choice_context = [
        option.get("para", None) for option in question['choices']
    ]

    instance = reader.text_to_instance(
        question_id,
        stem,
        texts,
        context=shared_context,
        choice_context_list=per_choice_context)

    return instance, {'id': question_id, 'choice_labels': labels}
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: premise = json_dict.get('premise', None) hypothesis = json_dict.get('hypothesis', None) premise_entities = json_dict.get('premise_entities', None) hypothesis_entities = json_dict.get('hypothesis_entities', None) instance = self._dataset_reader.text_to_instance( premise=premise, hypothesis=hypothesis, premise_entities=premise_entities, hypothesis_entities=hypothesis_entities) label_dict = self._model.vocab.get_index_to_token_vocabulary('labels') all_labels = [label_dict[i] for i in range(len(label_dict))] return instance, {"all_labels": all_labels}
def _get_entity_tags(self, entities: List[str], table_field: KnowledgeGraphField, entity_literals: JsonDict, tokenized_question: List[Token]) -> List[int]: res = [] # Hackily access last two feature extractors for table field (span overlaps which don't # depend on the actual table information) features = table_field._feature_extractors[8:] # pylint: disable=protected-access for i, token in enumerate(tokenized_question): tag_best = 0 score_max = 0.0 for tag_index, tag in enumerate(entities): literals = entity_literals.get(tag, []) if not isinstance(literals, list): literals = [literals] for literal in literals: tag_tokens = self._tokenizer.tokenize(literal.lower()) scores = [ fe(tag, tag_tokens, token, i, tokenized_question) for fe in features ] # Small tie breaker in favor of longer sequences score = max(scores) + len(tag_tokens) / 100 if score > score_max and score >= 0.5: tag_best = tag_index + 1 score_max = score res.append(tag_best) return res
def _compatible_question(self, question_data: JsonDict) -> bool: question_id = question_data.get('id') if not question_id: return True if not '_friction' in self._lf_syntax: return True return '_Fr_' in question_id or 'Friction' in question_id
def _compatible_question(self, question_data: JsonDict) -> bool: question_id = question_data.get('id') if not question_id: return True if not '_friction' in self._lf_syntax: return True return '_Fr_' in question_id or 'Friction' in question_id
def _get_entity_tags(self, entities: List[str], table_field: KnowledgeGraphField, entity_literals: JsonDict, tokenized_question: List[Token]) -> List[int]: res = [] # Hackily access last two feature extractors for table field (span overlaps which don't # depend on the actual table information) features = table_field._feature_extractors[8:] # pylint: disable=protected-access for i, token in enumerate(tokenized_question): tag_best = 0 score_max = 0.0 for tag_index, tag in enumerate(entities): literals = entity_literals.get(tag, []) if not isinstance(literals, list): literals = [literals] for literal in literals: tag_tokens = self._tokenizer.tokenize(literal.lower()) scores = [fe(tag, tag_tokens, token, i, tokenized_question) for fe in features] # Small tie breaker in favor of longer sequences score = max(scores) + len(tag_tokens)/100 if score > score_max and score >= 0.5: tag_best = tag_index + 1 score_max = score res.append(tag_best) return res
def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    We need to override this because of the interactive beam search aspects.

    The rules listed in ``inputs["initial_sequence"]`` (default: none) are
    forced as the start of the decoded sequence by temporarily swapping the
    model's beam search for a constrained one.  The prediction results are
    extended with:

    * ``"choices"``: for each predicted action, the
      ``(probability, action)`` pairs that were considered;
    * ``"beam_snapshots"``: per batch index, the beam contents at every
      timestep with rule indices converted back to rule strings.

    The model's original beam search is always restored, even when
    prediction or post-processing raises.

    Raises ``KeyError`` when a forced token is not one of the instance's
    rules.
    """
    instance = self._json_to_instance(inputs)

    # Get the rules out of the instance
    index_to_rule = [
        production_rule_field.rule
        for production_rule_field in instance.fields["actions"].field_list
    ]
    rule_to_index = {rule: i for i, rule in enumerate(index_to_rule)}

    # A sequence of strings to force, then convert them to ints
    initial_tokens = inputs.get("initial_sequence", [])

    # Want to get initial_sequence on the same device as the model.
    initial_sequence = torch.tensor(
        [rule_to_index[token] for token in initial_tokens],
        device=next(self._model.parameters()).device,
    )

    # Replace beam search with one that forces the initial sequence
    original_beam_search = self._model._beam_search
    interactive_beam_search = original_beam_search.constrained_to(
        initial_sequence)
    self._model._beam_search = interactive_beam_search

    try:
        # Now get results
        results = self.predict_instance(instance)

        # And add in the choices. Need to convert from idxs to rules.
        results["choices"] = [
            list(zip(pa["action_probabilities"], pa["considered_actions"]))
            for pa in results["predicted_actions"]
        ]

        results["beam_snapshots"] = {
            # For each batch_index, we get a list of beam snapshots
            batch_index: [
                # Each beam_snapshots consists of a list of timesteps,
                # each of which is a list of pairs (score, sequence).
                # The sequence is the *indices* of the rules, which we
                # want to convert to the string representations.
                [(score, [index_to_rule[idx] for idx in sequence])
                 for score, sequence in timestep_snapshot]
                for timestep_snapshot in beam_snapshots
            ]
            for batch_index, beam_snapshots in
            interactive_beam_search.beam_snapshots.items()
        }
    finally:
        # Restore original beam search, also on failure, so one bad request
        # cannot leave the shared model constrained for later requests.
        self._model._beam_search = original_beam_search

    return results
def convert_qajson_to_entailment(qa_json: JsonDict):
    """
    Turn each answer choice of a QA item into an entailment statement.

    For every choice, the question stem is converted to fill-in-the-blank form
    and combined with the choice text into a hypothesis.  The hypothesis is
    passed to ``create_output_dict`` together with ``qa_json`` and a flag that
    is true when the choice's label equals ``qa_json["answerKey"]`` (``"Z"``
    is used when no answer key is present).

    Returns the same ``qa_json`` object that was passed in.
    """
    question = qa_json["question"]
    stem = question["stem"]

    for option in question["choices"]:
        option_text = option["text"]
        fitb = get_fitb_from_question(stem)
        hypothesis = create_hypothesis(fitb, option_text)
        is_answer = option["label"] == qa_json.get("answerKey", "Z")
        create_output_dict(qa_json, hypothesis, is_answer)

    return qa_json
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"question": "...", "passage": "..."}``. """ question_text = json_dict["question"] background = json_dict["background"] situation = json_dict.get("situation") return self._dataset_reader.text_to_instance(question_text, background, situation)
def attack(self, attacker_id: str, attack: JsonDict) -> JsonDict:
    """
    Modifies the input (e.g. by adding or removing tokens) to try to change
    the model's predictions in some desired manner.

    ``attack`` must contain ``"inputs"``.  Optional keys are
    ``"input_field_to_attack"`` (default ``"tokens"``),
    ``"grad_input_field"`` (default ``"grad_input_1"``),
    ``"ignore_tokens"`` (default ``["@@NULL@@"]``) and ``"target"``.

    Returns the sanitized dict ``{"final": ..., "original": ...}``, each
    keyed by the index of the labeled instance.

    Raises ``http.UnknownAttackerError`` when ``attacker_id`` is not a valid
    attacker, ``http.InvalidAttackerError`` when no such attacker is
    registered on this model, and ``ValueError`` when a target is given.
    """
    if attacker_id not in config.VALID_ATTACKERS:
        raise http.UnknownAttackerError(attacker_id)
    attacker = self.attackers.get(attacker_id)
    if attacker is None:
        raise http.InvalidAttackerError(attacker_id)

    print('attack', attack)

    inputs = attack['inputs']
    field_to_attack = attack.get('input_field_to_attack', 'tokens')
    grad_field = attack.get('grad_input_field', 'grad_input_1')
    ignore_tokens = attack.get('ignore_tokens', None)

    if attack.get('target', None) is not None:
        raise ValueError(
            "Input reduction does not implement targeted attacks")
    if ignore_tokens is None:
        ignore_tokens = ["@@NULL@@"]

    labeled_instances = self.predictor.labeled_json_to_labeled_instances(
        inputs)

    final_tokens, original_tokens = {}, {}
    for idx, instance in sorted(labeled_instances.items()):
        # The attack runs first and the instance's tokens are copied after it,
        # exactly as before.
        final_tokens[idx] = attacker._attack_instance(
            inputs, instance, field_to_attack, grad_field, ignore_tokens)
        original_tokens[idx] = deepcopy(instance[field_to_attack].tokens)

    return sanitize({"final": final_tokens, "original": original_tokens})
def dump_line(self, outputs: JsonDict) -> str:
    """
    Serialize one model output as a single line of JSON.

    The line contains ``"idx"`` (``outputs["index"]`` converted to ``int``),
    ``"pseudolabel"`` (``outputs["logits"]``) and every entry of the optional
    ``outputs["raw_input"]`` dict.  Raw-input entries are merged last, so
    they win if they reuse one of those two keys.  Non-ASCII characters are
    written unescaped.

    The predicted label itself is not part of the line, so neither
    ``outputs["label"]`` nor ``outputs["prediction"]`` is required.

    Returns the JSON text followed by a newline.
    """
    # NOTE(review): a clamped/formatted ``prediction`` used to be computed
    # here but was never written to the output (its ``"label"`` entry was
    # commented out).  It has been dropped so a missing label no longer
    # raises.
    output = {
        "idx": int(outputs["index"]),
        "pseudolabel": outputs["logits"],
        **outputs.get("raw_input", {}),
    }
    return json.dumps(output, ensure_ascii=False) + "\n"
def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
    """
    Run the model on one ProStruct JSON instance and return an enriched copy.

    Keys read from ``inputs``:

    * ``"sentence_texts"`` (required): a list of sentences, or one string
      with sentences separated by newlines.
    * ``"participants"`` (required, may be empty): a list, or one string
      separated by newlines, tabs or commas (checked in that order).  When
      empty, participants are derived from the sentences.
    * ``"states"`` (optional): states[i][j] is the ith participant at time j.
    * ``"para_id"`` (optional, default -1).
    * ``"top_k_sequences"`` (optional, default 2).
    * ``"dont_use_kb"`` (optional): when truthy, a dummy action scorer is
      used for this call only.
    * ``"rules_used"`` (optional): comma-separated integers; each value
      greater than zero switches the corresponding rule on for this call.

    ``cuda_device`` is accepted for interface compatibility and not used.

    Returns ``inputs`` merged with the model's JSON output plus
    ``default_kb_used``, ``default_rules_used``, ``predicted_locations``,
    ``settings_used`` and ``sentences``.
    """
    # Para id is useful for decoder trainer. As we won't call it at prediction time,
    # we make this optional.
    para_id = inputs.get("para_id", -1)

    sentence_texts = inputs["sentence_texts"]
    # Only a string can be split; a list of sentences is used as given.
    if isinstance(sentence_texts, str) and "\n" in sentence_texts:
        sentence_texts = sentence_texts.split("\n")

    participants = inputs["participants"]
    if not participants:
        participants = list(
            self.helper.participants_from_sentences(sentence_texts))

    # Participants can be separated in many different ways
    # (A participant can contain comma and in those cases we separate by "\n" or "\t").
    # Do this only when participants is not already a list (demo passes a string).
    if isinstance(participants, str):
        if "\n" in participants:
            separator = "\n"
        elif "\t" in participants:
            separator = "\t"
        else:
            separator = ","
        participants = [p.strip() for p in participants.split(separator)]
    # A further comma split used to follow here.  By this point the value is
    # no longer a string, so that step could only be a no-op or fail on
    # ``.split``; it has been removed.

    states = inputs.get("states", None)
    # Can be used in demo (eventually the demo would control more parameters such as which commonsense etc).
    top_k_sequences = inputs.get("top_k_sequences", 2)

    print("Predictor gets input: ", inputs)
    print(f"Predictor formats inputs =\n{para_id},\n{sentence_texts}\n{participants}")

    instance = self._dataset_reader.text_to_instance(
        para_id=para_id,
        sentence_texts=sentence_texts,
        participants=list(participants),  # convert from set
        states=states,
        filename="test",
    )

    decoder_step = self._model.decoder_step
    old_action_scorer = decoder_step.get_action_scorer()
    old_valid_action_gen = decoder_step.get_valid_action_generator()
    rules_used_original = decoder_step.get_valid_action_generator().get_rules_used()

    dont_use_kb = bool(inputs.get("dont_use_kb"))
    rules_changed = inputs.get("rules_used") is not None

    try:
        if dont_use_kb:
            decoder_step.change_action_scorer(ActionScorerDummy())
        if rules_changed:
            updated_rules = [
                int(rule_val.strip()) > 0
                for rule_val in inputs["rules_used"].split(",")
            ]
            decoder_step.get_valid_action_generator().set_rules_used(updated_rules)

        outputs = self._model.forward_on_instance(instance)
    finally:
        # Reset to original settings, also when parsing the rules or the
        # forward pass fails, so the overrides never leak into later calls.
        if dont_use_kb:
            decoder_step.change_action_scorer(old_action_scorer)
        if rules_changed:
            # NOTE(review): this re-installs the same generator object whose
            # rules were changed above; whether that also restores
            # ``rules_used_original`` depends on ``set_rules_used`` - confirm.
            decoder_step.change_valid_action_generator(old_valid_action_gen)

    json_outputs = ProStructPredictor.to_json(outputs, participants,
                                              top_k_sequences)
    json_outputs["default_kb_used"] = decoder_step.get_action_scorer().name
    json_outputs["default_rules_used"] = rules_used_original
    json_outputs['predicted_locations'] = self.predict_locations(
        outputs, sentence_texts, participants)

    settings_used = ""
    if rules_changed or dont_use_kb:
        settings_used = f"rules used: {inputs.get('rules_used', '')} and using {'no kb' if dont_use_kb else 'kb'}"
    json_outputs['settings_used'] = settings_used
    json_outputs["sentences"] = sentence_texts

    return {**inputs, **json_outputs}
def _json_to_instance(self, json_dict: JsonDict) -> Instance: return self._dataset_reader.text_to_instance( json_dict["sentence1"], json_dict["sentence2"], json_dict.get("gold_label"))
def _json_to_instance(self, json_dict: JsonDict) -> Instance: text = json_dict['text'] entity = json_dict.get('entity') metadata = json_dict.get('metadata') return self._dataset_reader.text_to_instance(text, entity, metadata)
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: index = json_dict.get('index', None) article = json_dict['article'] instance = self._dataset_reader.text_to_instance(source_string=article) return instance, {'index': index} if index is not None else {}
def _json_to_instance(self, json_dict: JsonDict) -> Instance: tokens = json_dict['sentence'] sentiment = json_dict.get('sentiment') instance = self._dataset_reader.text_to_instance(tokens=tokens, sentiment=sentiment) return instance