def process_output(self, output: JsonDict) -> JsonDict:
    """
    Reduce a raw model ``output`` dict to the fields kept for evidence-chain inspection.

    ``pred_chains`` is built in one of two ways:

    * when ``output`` carries ``pred_sent_orders``, every order is converted with
      ``order2chain`` and any chain that references an index at or beyond the number
      of sentences is dropped;
    * otherwise each of the (at most) ten sentences with the highest ``gate_probs``
      becomes a single-sentence chain.

    Raises ``AssertionError`` when orders were predicted but no chain survives the
    filtering step.
    """
    sent_orders = output.get('pred_sent_orders', None)
    # The length of ``sent_labels`` is used as the sentence count, to remove padding.
    sentence_count = len(output['sent_labels'])

    if sent_orders is None:
        # Get predicted evidence from the sentences with the top-k ``gate_prob``.
        probs = output['gate_probs'][:sentence_count]
        ranked = sorted(range(sentence_count), key=lambda idx: probs[idx], reverse=True)
        pred_chains = [[idx] for idx in ranked[:10]]
    else:
        candidate_chains = [order2chain(order) for order in sent_orders]
        pred_chains = [
            chain
            for chain in candidate_chains
            if all(sent < sentence_count for sent in chain)
        ]
        assert len(pred_chains) > 0, (
            repr([order2chain(order) for order in sent_orders])
            + '\n'
            + 'num sents: %d' % sentence_count
            + '\n%s' % output['_id']
        )

    # Currently disabled fields: answer_texts, best_span_str, best_span,
    # token_spans_sp, token_spans_sent.
    return {
        'pred_sent_labels': output.get('pred_sent_labels', None),
        'pred_sent_orders': output.get('pred_sent_orders', None),
        'pred_chains': pred_chains,
        'possible_chain': output.get('evd_possible_chains', None),
        'question_tokens': output['question_tokens'],
        'passage_sent_tokens': output['passage_sent_tokens'],
        'sent_labels': output['sent_labels'],
        'ans_sent_idxs': output.get('ans_sent_idxs', None),
        '_id': output['_id'],
    }
def predict(self, inputs: JsonDict):
    """
    Answer ``inputs["question"]`` about an image given by URL or as base64 data.

    ``inputs["image_url"]`` takes precedence.  Otherwise
    ``inputs["image"]["image_base64"]`` is decoded into a named temporary file whose
    path is handed to the parent ``predict``; the file only exists for the duration
    of that call.

    Returns at most 45 ``{"answer": ..., "confidence": ...}`` dicts ordered by
    decreasing confidence, where confidence is the token score times 100 and tokens
    starting with ``"@@"`` are left out.

    Raises ``ValueError`` when neither image source yields a prediction.
    """
    result = None
    image_url = inputs.get("image_url")
    if image_url is None:
        image = inputs.get("image")
        image_base64 = None if image is None else image["image_base64"]
        if image_base64 is not None:
            temp_prefix = f"{self.__class__.__name__}-"
            with tempfile.NamedTemporaryFile(prefix=temp_prefix) as f:
                f.write(standard_b64decode(image_base64))
                # Flush so the parent predictor sees the full image on disk.
                f.flush()
                result = super().predict({
                    "question": inputs["question"],
                    "image": f.name
                })
    else:
        result = super().predict({
            "question": inputs["question"],
            "image": image_url
        })

    if result is None:
        raise ValueError("No image found in request.")

    candidates = []
    for token, score in result["tokens"].items():
        if token.startswith("@@"):
            continue
        candidates.append({"answer": token, "confidence": score * 100})
    ranked = sorted(candidates, key=lambda entry: -entry["confidence"])
    # Jon only wants the first 45 results.
    return ranked[:45]
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    Serialize a prediction as one JSON line, keeping only the citation fields.

    Every key other than ``citation_id``, ``prediction``, ``probabilities`` and
    ``citation_text`` is removed from ``outputs`` in place before it is encoded with
    ``JsonFloatEncoder``.  Override this function if you don't want your outputs in
    JSON-lines format.
    """
    keep = ['citation_id', 'prediction', 'probabilities', 'citation_text']
    # Collect first, delete afterwards: a dict must not shrink while it is iterated.
    unwanted = [name for name in outputs if name not in keep]
    for name in unwanted:
        del outputs[name]
    return json.dumps(outputs, cls=JsonFloatEncoder) + "\n"
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    Serialize a prediction as one JSON line restricted to the paper-id fields.

    Only ``citedPaperId``, ``citingPaperId`` and ``excerptCitationIntents`` survive;
    all other keys are popped from ``outputs`` itself (the caller's dict is modified).
    The result is encoded with ``JsonFloatEncoder`` and terminated by a newline.
    Override this function if you don't want your outputs in JSON-lines format.
    """
    wanted = ('citedPaperId', 'citingPaperId', 'excerptCitationIntents')
    # Iterate over a snapshot of the keys because entries are removed in the loop.
    for field in list(outputs.keys()):
        if field in wanted:
            continue
        outputs.pop(field)
    serialized = json.dumps(outputs, cls=JsonFloatEncoder)
    return serialized + "\n"
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"tokens": "[...]", "upos_tags": "[...]"}``. """ tokens = json_dict["tokens"] gold_upos_tags = json_dict.get("upos_tags", None) gold_lemmas = json_dict.get("lemmas", None) return self._dataset_reader.text_to_instance(tokens=tokens, upos_tags=gold_upos_tags, lemmas=gold_lemmas)
def _json_to_instance(self, json_dict: JsonDict) -> Instance: text = json_dict["text"] text = " " + text.strip() sentiment = json_dict["sentiment"] text_id = json_dict.get("TextID") if text_id is None: text_id = "<No-Text-id>" return self._dataset_reader.text_to_instance( text, sentiment, text_id, json_dict.get("selected_text") )
def _json_to_instance( self, # type: ignore json_dict: JsonDict) -> Instance: premise_text = json_dict.get("sentence1", None) or json_dict.get( "premise", None) hypothesis_text = json_dict.get("sentence2", None) or json_dict.get( "hypothesis", None) if premise_text and hypothesis_text: return self._dataset_reader.text_to_instance( premise_text, hypothesis_text) logger.info("Error parsing input") return None
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"question": "...", "fact": "..."}``. """ if isinstance(json_dict["question"], dict): question_stem = json_dict["question"]["stem"] choices = [x["text"] for x in json_dict["question"]["choices"]] else: question_text = json_dict["question"] question_stem, choices = decompose_question(question_text) fact = json_dict.get("fact") or json_dict.get("fact1") span = json_dict.get("span") or json_dict.get("answer_spans")[0] spans = [span] if "relation" in json_dict: relations = [json_dict["relation"]] else: relations = None if "offset" in json_dict: offset = json_dict["offset"] elif "answer_starts" in json_dict: offset = json_dict["answer_starts"][0] else: offset = fact.index(span) if offset == -1: raise ValueError("Span: {} not found in fact: {}".format( span, fact)) offsets = [offset] #[(offset, offset + len(span))] if "id" in json_dict: qid = json_dict["id"] else: qid = random.randint(100) prefetched_sentences = json_dict.get("prefetched_sentences", None) prefetched_indices = json_dict.get("prefetched_indices", None) if prefetched_sentences is not None: return self._dataset_reader.text_to_instance( qid, question_stem, choices, fact, spans, relations, answer_starts=offsets, prefetched_sentences=prefetched_sentences, prefetched_indices=prefetched_indices) else: return self._dataset_reader.text_to_instance(qid, question_stem, choices, fact, spans, relations, answer_starts=offsets)
def _json_to_instance(self, # type: ignore json_dict: JsonDict) -> Instance: premises = json_dict["premises"] hypotheses = json_dict["hypotheses"] entailments = json_dict.get("entailments", None) if entailments is None: answer_indices = None else: answer_indices = [index for index, entailment in enumerate(entailments) if entailment] relevant_sentence_idxs = json_dict.get("relevant_sentence_idxs", None) return self._dataset_reader.text_to_instance(premises, hypotheses, answer_indices, relevant_sentence_idxs)
def _compatible_question(self, question_data: JsonDict) -> bool: question_id = question_data.get("id") if not question_id: return True if "_friction" not in self._lf_syntax: return True return "_Fr_" in question_id or "Friction" in question_id
def _my_json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
    """
    Turn a multiple-choice question JSON into an instance plus bookkeeping info.

    Reads ``id``, ``question.stem`` and, for every entry of ``question.choices``, its
    ``text``, ``label`` and optional ``para``.  An optional top-level ``para`` is
    passed on as the shared context.

    Returns the instance together with ``{'id': ..., 'choice_labels': [...]}``.
    """
    # Make a cast here to satisfy mypy
    dataset_reader = cast(BertMCQAReader, self._dataset_reader)

    qid = json_dict['id']
    question_data = json_dict['question']
    question_text = question_data['stem']

    choice_text_list = []
    choice_labels = []
    for choice in question_data['choices']:
        choice_text_list.append(choice['text'])
    for choice in question_data['choices']:
        choice_labels.append(choice['label'])

    context = json_dict.get("para", None)
    # One (possibly ``None``) context per choice, in choice order.
    choice_context_list = [choice.get("para", None) for choice in question_data['choices']]

    instance = dataset_reader.text_to_instance(
        qid,
        question_text,
        choice_text_list,
        context=context,
        choice_context_list=choice_context_list)
    return instance, {'id': qid, 'choice_labels': choice_labels}
def _get_entity_literals(self, question_data: JsonDict) -> JsonDict: res: JsonDict = {} for key, value in question_data.items(): if '_literals' in key and key.replace('_literals', '') in self._entity_types: res.update(value) return res
def _json_to_instance(self, json_dict: JsonDict) -> Instance: return self._dataset_reader.text_to_instance( words=json_dict['words'], ud_head_indices=json_dict['heads'], ud_tags=json_dict['tags'], ud_labels=json_dict['head_labels'], metadata=json_dict.get('metadata', None))
def _compatible_question(self, question_data: JsonDict) -> bool: question_id = question_data.get('id') if not question_id: return True if not '_friction' in self._lf_syntax: return True return '_Fr_' in question_id or 'Friction' in question_id
def _get_entity_tags(self, entities: List[str], table_field: KnowledgeGraphField, entity_literals: JsonDict, tokenized_question: List[Token]) -> List[int]: res = [] # Hackily access last two feature extractors for table field (span overlaps which don't # depend on the actual table information) features = table_field._feature_extractors[8:] # pylint: disable=protected-access for i, token in enumerate(tokenized_question): tag_best = 0 score_max = 0.0 for tag_index, tag in enumerate(entities): literals = entity_literals.get(tag, []) if not isinstance(literals, list): literals = [literals] for literal in literals: tag_tokens = self._tokenizer.tokenize(literal.lower()) scores = [fe(tag, tag_tokens, token, i, tokenized_question) for fe in features] # Small tie breaker in favor of longer sequences score = max(scores) + len(tag_tokens)/100 if score > score_max and score >= 0.5: tag_best = tag_index + 1 score_max = score res.append(tag_best) return res
def _get_entity_tags(self, entities: List[str], table_field: KnowledgeGraphField, entity_literals: JsonDict, tokenized_question: List[Token]) -> List[int]: res = [] # Hackily access last two feature extractors for table field (span overlaps which don't # depend on the actual table information) features = table_field._feature_extractors[8:] # pylint: disable=protected-access for i, token in enumerate(tokenized_question): tag_best = 0 score_max = 0.0 for tag_index, tag in enumerate(entities): literals = entity_literals.get(tag, []) if not isinstance(literals, list): literals = [literals] for literal in literals: tag_tokens = self._tokenizer.tokenize(literal.lower()) scores = [ fe(tag, tag_tokens, token, i, tokenized_question) for fe in features ] # Small tie breaker in favor of longer sequences score = max(scores) + len(tag_tokens) / 100 if score > score_max and score >= 0.5: tag_best = tag_index + 1 score_max = score res.append(tag_best) return res
def labeled_json_to_labeled_instances(
        self, json_dict: JsonDict) -> Dict[int, Instance]:
    """
    Build one labeled instance per entry of ``json_dict``.

    The keys of ``json_dict`` are integer strings and are visited in numeric order.
    Each entry provides ``words`` (dicts with a ``text``) and ``tags``.  Entries are
    grouped into consecutive runs: the first entry of a run fixes the run length as
    its number of words, and each entry's position inside the run
    (``i - seq_offset``) is passed to ``ConstructiveSupertagField``.

    Returns a dict from the integer key to the instance, indexed with a vocabulary
    built on the fly from the tag labels seen so far.
    """
    seq_offset = 0
    seq_len = -1  # -1 means "the next entry starts a new run"
    adhoc_vocab = Vocabulary()
    instances = {}

    numbered_keys = sorted((int(key), key) for key in json_dict.keys())
    for i, str_i in numbered_keys:
        inst_obj = json_dict[str_i]
        words = inst_obj['words']
        if seq_len == -1:
            seq_len = len(words)

        text_field = TextField([Token(tok['text']) for tok in words], {})
        new_instance = Instance({'tokens': text_field}).duplicate()

        categories = [json_to_cat(tag) for tag in inst_obj['tags']]
        tags_field = ConstructiveSupertagField(categories, text_field, [i - seq_offset])
        adhoc_vocab.add_tokens_to_namespace(tags_field.labels, 'labels')
        new_instance.add_field('tags', tags_field)
        new_instance.index_fields(adhoc_vocab)
        instances[i] = new_instance

        # Last entry of the current run: advance the offset and start a new run.
        if i + 1 - seq_offset == seq_len:
            seq_offset += seq_len
            seq_len = -1
    return instances
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: premise = json_dict.get('premise', None) hypothesis = json_dict.get('hypothesis', None) premise_entities = json_dict.get('premise_entities', None) hypothesis_entities = json_dict.get('hypothesis_entities', None) instance = self._dataset_reader.text_to_instance( premise=premise, hypothesis=hypothesis, premise_entities=premise_entities, hypothesis_entities=hypothesis_entities) label_dict = self._model.vocab.get_index_to_token_vocabulary('labels') all_labels = [label_dict[i] for i in range(len(label_dict))] return instance, {"all_labels": all_labels}
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    Write the predicted SQL query on its own line, followed by the beam query if any.

    ``outputs['predicted_sql_query']`` is always emitted; when ``outputs`` also has a
    ``beam_sql_query`` entry it is emitted on a second line.  Every line ends with a
    newline.  Override this function to output predictions differently.
    """
    queries = [outputs['predicted_sql_query']]
    if 'beam_sql_query' in outputs:
        queries.append(outputs['beam_sql_query'])
    return "".join(query + "\n" for query in queries)
def dump_line(self, output: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    Serialize a parse as one JSON line with ``sentence_id``, ``tokens`` and ``spans``.

    As a side effect ``class_probabilities`` is removed from ``output`` and
    ``linkNameToLabel`` / ``nodeTypeToStyle`` are removed from its ``hierplane_tree``.
    ``sentence_id`` is ``None`` when the metadata does not carry one.
    """
    output.pop('class_probabilities', None)
    tree = output['hierplane_tree']
    for styling_key in ('linkNameToLabel', 'nodeTypeToStyle'):
        tree.pop(styling_key, None)

    # Spans are 4-tuple with (start, end (exclusive), span_text, span_label)
    spans = self.get_parse_spans(tree, [])

    metadata = output['metadata']
    sentence_id = metadata['sentence_id'] if "sentence_id" in metadata else None
    record = {
        'sentence_id': sentence_id,
        'tokens': metadata['tokens'],
        'spans': spans,
    }
    return json.dumps(record) + "\n"
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"sentence": "..."}``. """ if 'text_idx' in json_dict: x = (json_dict['text_idx'], json_dict['comment_idx'] ) # , json_dict['comment_idx']) return self._dataset_reader.text_to_instance(*x) # type: ignore return self._dataset_reader.text_to_instance(*json_dict.values())
def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    We need to override this because of the interactive beam search aspects.

    The optional ``inputs["initial_sequence"]`` is a list of production-rule strings
    that the decoder is forced to start with.  While the prediction runs, the model's
    ``_beam_search`` is swapped for a copy constrained to that sequence; the original
    beam search is put back afterwards, even if prediction fails.

    Besides the regular prediction output, the result contains:

    * ``choices`` - for every predicted action, the list of
      ``(probability, action)`` pairs that were considered;
    * ``beam_snapshots`` - per batch index, the beam snapshots with rule indices
      replaced by the rule strings.

    Raises ``KeyError`` if a forced token is not among the instance's rules.
    """
    instance = self._json_to_instance(inputs)

    # Get the rules out of the instance
    index_to_rule = [
        production_rule_field.rule
        for production_rule_field in instance.fields["actions"].field_list
    ]
    rule_to_index = {rule: i for i, rule in enumerate(index_to_rule)}

    # A sequence of strings to force, then convert them to ints
    initial_tokens = inputs.get("initial_sequence", [])

    # Want to get initial_sequence on the same device as the model.
    initial_sequence = torch.tensor(
        [rule_to_index[token] for token in initial_tokens],
        device=next(self._model.parameters()).device,
    )

    # Replace beam search with one that forces the initial sequence
    original_beam_search = self._model._beam_search
    interactive_beam_search = original_beam_search.constrained_to(initial_sequence)
    self._model._beam_search = interactive_beam_search

    try:
        # Now get results
        results = self.predict_instance(instance)

        # And add in the choices. Need to convert from idxs to rules.
        results["choices"] = [
            list(zip(pa["action_probabilities"], pa["considered_actions"]))
            for pa in results["predicted_actions"]
        ]

        results["beam_snapshots"] = {
            # For each batch_index, we get a list of beam snapshots
            batch_index: [
                # Each beam_snapshots consists of a list of timesteps,
                # each of which is a list of pairs (score, sequence).
                # The sequence is the *indices* of the rules, which we
                # want to convert to the string representations.
                [(score, [index_to_rule[idx] for idx in sequence])
                 for score, sequence in timestep_snapshot]
                for timestep_snapshot in beam_snapshots
            ]
            for batch_index, beam_snapshots in interactive_beam_search.beam_snapshots.items()
        }
    finally:
        # Restore original beam search, so a failed request cannot leave the
        # constrained search installed on the shared model.
        self._model._beam_search = original_beam_search

    return results
def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]: sentence = json_dict["sentence"] if "verbs" in json_dict.keys(): text = sentence.split() pos = ["VERB" if i == json_dict["verbs"] else "NOUN" for i, _ in enumerate(text)] tokens = [Token(t, i, i + len(text), pos_=p) for i, (t, p) in enumerate(zip(text, pos))] else: tokens = self._tokenizer.tokenize(sentence) return self.tokens_to_instances(tokens)
def convert_qajson_to_entailment(qa_json: JsonDict):
    """
    Create one entailment statement per answer choice of a question JSON.

    For every choice, the question stem is turned into a fill-in-the-blank form with
    ``get_fitb_from_question`` and combined with the choice text by
    ``create_hypothesis``.  The statement is handed to ``create_output_dict`` together
    with ``qa_json`` and a flag telling whether the choice's label equals
    ``answerKey`` (``"Z"`` is compared against when no answer key is present).

    Returns ``qa_json`` itself.
    """
    question = qa_json["question"]
    stem = question["stem"]
    for option in question["choices"]:
        option_text = option["text"]
        hypothesis = create_hypothesis(get_fitb_from_question(stem), option_text)
        is_answer = option["label"] == qa_json.get("answerKey", "Z")
        create_output_dict(qa_json, hypothesis, is_answer)
    return qa_json
def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"question": "...", "passage": "..."}``. """ question_text = json_dict["question"] background = json_dict["background"] situation = json_dict.get("situation") return self._dataset_reader.text_to_instance(question_text, background, situation)
def align_entities(extracted: List[str],
                   literals: JsonDict,
                   stemmer: NltkPorterStemmer) -> List[str]:
    """
    Use stemming to attempt alignment between extracted world and given world literals.

    For every extracted string, ``get_stem_overlaps`` yields one overlap value per
    literal.  The key of the first literal is chosen when the first overlap is larger,
    the key of the second when the second is larger, and ``None`` when they tie.  Only
    the first two literals are ever compared.
    """
    literal_keys = list(literals.keys())
    literal_values = list(literals.values())
    overlaps = [get_stem_overlaps(extract, literal_values, stemmer) for extract in extracted]

    def pick_world(overlap):
        """Key of the literal with the larger overlap, or ``None`` on a tie."""
        if overlap[0] > overlap[1]:
            return literal_keys[0]
        if overlap[0] < overlap[1]:
            return literal_keys[1]
        return None

    return [pick_world(overlap) for overlap in overlaps]
def attack_from_json(self, inputs: JsonDict = None) -> JsonDict:
    """
    Try to flip the model's label by randomly swapping words for their neighbours.

    The text in ``inputs[self.f2c]`` is tokenized; every token that is not in
    ``self.ignore_tokens`` and has at least one neighbour (from ``self.searcher``) not
    in ``self.forbidden_tokens`` is a candidate for replacement.  ``self.search_num``
    perturbed copies are generated, each with up to ``max_change_num`` random
    replacements, and predicted as one batch.  Scanning stops at the first copy whose
    label on field ``self.f2a`` differs from the original prediction.

    Returns a sanitized dict with the adversarial tokens (``adv``), the original tokens
    (``raw``), the model output for the reported copy (``outputs``) and ``success``
    (1 or 0).  Without a successful copy, the last one is reported.

    NOTE(review): ``inputs`` defaults to ``None`` but is dereferenced right away, and
    the method needs at least one generated copy - callers presumably always pass a
    dict and use ``search_num >= 1``; confirm.
    """
    _volatile_json_ = inputs.copy()
    raw_instance = self.predictor.json_to_labeled_instances(inputs)[0]
    raw_tokens = [tok.text for tok in self.spacy.tokenize(inputs[self.f2c])]

    # Select words that can be changed
    sids_to_change = []
    nbr_dct = defaultdict(list)
    for sid, word in enumerate(raw_tokens):
        if word in self.ignore_tokens:
            continue
        nbrs = [nbr for nbr in self.searcher.search(word)
                if nbr not in self.forbidden_tokens]
        if nbrs:
            sids_to_change.append(sid)
            nbr_dct[sid] = nbrs

    # max number of tokens that can be changed
    max_change_num = min(self.max_change_num(len(raw_tokens)), len(sids_to_change))

    # Construct adversarial instances
    adv_jsons = []
    for i in range(self.search_num):
        adv_tokens = list(raw_tokens)
        for word_sid in random.choices(sids_to_change, k=max_change_num):
            adv_tokens[word_sid] = random.choice(nbr_dct[word_sid])
        _volatile_json_[self.f2c] = " ".join(adv_tokens)
        adv_jsons.append(_volatile_json_.copy())

    # Checking attacking status, early stop
    successful = False
    results = self.predictor.predict_batch_json(adv_jsons)
    for i, result in enumerate(results):
        adv_instance = self.predictor._json_to_instance(adv_jsons[i])
        adv_instance = self.predictor.predictions_to_labeled_instances(
            adv_instance, result)[0]
        label_changed = adv_instance[self.f2a].label != raw_instance[self.f2a].label
        if label_changed:
            successful = True
            break

    # ``i`` / ``result`` still refer to the copy the scan stopped at (or the last one).
    adv_tokens = adv_jsons[i][self.f2c].split(" ")
    outputs = result
    return sanitize({
        "adv": adv_tokens,
        "raw": raw_tokens,
        "outputs": outputs,
        "success": 1 if successful else 0,
    })
def attack(self, attacker_id: str, attack: JsonDict) -> JsonDict:
    """
    Modifies the input (e.g. by adding or removing tokens) to try to change the model's
    prediction in some desired manner.

    ``attack`` must contain ``inputs`` and may contain ``input_field_to_attack``
    (default ``'tokens'``), ``grad_input_field`` (default ``'grad_input_1'``),
    ``ignore_tokens`` (default ``["@@NULL@@"]``) and ``target``.

    Returns a sanitized dict with, per labeled instance index, the tokens produced by
    the attacker (``final``) and a deep copy of the instance's tokens (``original``).

    Raises ``http.UnknownAttackerError`` for an id outside ``config.VALID_ATTACKERS``,
    ``http.InvalidAttackerError`` when no attacker is registered under the id, and
    ``ValueError`` when a ``target`` is supplied.
    """
    if attacker_id not in config.VALID_ATTACKERS:
        raise http.UnknownAttackerError(attacker_id)

    attacker = self.attackers.get(attacker_id)
    if attacker is None:
        raise http.InvalidAttackerError(attacker_id)

    print('attack', attack)
    inputs = attack['inputs']
    input_field_to_attack = attack.get('input_field_to_attack', 'tokens')
    grad_input_field = attack.get('grad_input_field', 'grad_input_1')
    ignore_tokens = attack.get('ignore_tokens', None)

    if attack.get('target', None) is not None:
        raise ValueError("Input reduction does not implement targeted attacks")
    if ignore_tokens is None:
        ignore_tokens = ["@@NULL@@"]

    original_instances = self.predictor.labeled_json_to_labeled_instances(inputs)
    final_tokens, original_tokens = {}, {}
    for idx, instance in sorted(original_instances.items()):
        final_tokens[idx] = attacker._attack_instance(
            inputs, instance, input_field_to_attack, grad_input_field, ignore_tokens)
        # Copied after the attack call, exactly as before.
        original_tokens[idx] = deepcopy(instance[input_field_to_attack].tokens)

    return sanitize({"final": final_tokens, "original": original_tokens})
def dump_line(self, outputs: JsonDict) -> str:
    """
    Serialize one prediction as a JSON line carrying the logits as a pseudolabel.

    The line contains ``idx`` (``outputs["index"]`` as an int), ``pseudolabel``
    (``outputs["logits"]``) and every entry of the optional ``outputs["raw_input"]``.
    Non-ASCII characters are written unescaped.

    ``outputs["label"]`` (or ``outputs["prediction"]`` in numeric mode) is still read
    and, for floats, clamped to ``[0, 5]`` and formatted with three decimals, but the
    ``label`` field that would carry it is currently disabled.
    """
    if self.numeric:
        prediction = outputs["prediction"]
        if isinstance(prediction, float):
            clamped = min(max(prediction, 0), 5)
            prediction = f"{clamped:.3f}"
    else:
        prediction = outputs["label"]

    record = {
        "idx": int(outputs["index"]),
        # "label": prediction,
        "pseudolabel": outputs["logits"],
    }
    record.update(outputs.get("raw_input", {}))
    return json.dumps(record, ensure_ascii=False) + "\n"
def predict_json(self, inputs: JsonDict) -> JsonDict:
    """
    Return nearest-neighbour tracks for a track id or for a model prediction.

    ``n`` (default 10) is popped from ``inputs`` and gives the number of neighbours.

    * With a ``track_id``: the id is looked up in the ``labels`` vocabulary and the
      index is queried for ``n + 1`` neighbours, of which the first is discarded.
    * Without one: the model is run on ``inputs``.  If an index is available, the
      neighbours of the predicted ``logits`` are returned as tracks; otherwise the raw
      output dict (extended with an ``inputs`` entry) is returned.

    Raises ``AttributeError`` when a ``track_id`` is given but no index has been built.
    """
    n = inputs.pop('n', 10)

    if 'track_id' not in inputs:
        output_dict = self.predict_instance(self._json_to_instance(inputs))
        output_dict['inputs'] = inputs
        if not self.index:
            return output_dict
        logits = output_dict.get('logits')
        return self.neighbors_to_tracks(self.index.get_nns_by_vector(logits, n))

    if self.index is None:
        raise AttributeError("Please build an index before searching by track.")
    label_to_index = self.vocab.get_token_to_index_vocabulary("labels")
    idx = label_to_index[inputs['track_id']]
    # Ask for one extra neighbour and drop the first result (presumably the query
    # track itself).
    nns = self.index.get_nns_by_item(idx, n + 1)[1:]
    return self.neighbors_to_tracks(nns)
def get_most_important_part(self, instance: Instance, output: JsonDict):
    """
    Annotate ``output`` with the token span that received the highest ``alphas`` weight.

    Candidate spans are numbered in the order produced by two nested loops over
    ``(i, j)`` with ``1 <= i <= j <= len(tokens) - 2``, i.e. the first and the last
    token are never part of a span.  The span whose number equals ``argmax(alphas)``
    is selected.

    ``alphas`` is removed from ``output`` and the following keys are added in place:

    * ``best_span`` - the selected tokens' text joined by spaces;
    * ``nom`` - the running span number at which the search stopped;
    * ``ij`` - ``[i, j]`` of the selected span (inclusive);
    * ``break`` - whether a span with the wanted number was found;
    * ``val`` - the winning ``alphas`` value.

    If no span has the wanted number, the last enumerated span is reported, or
    ``(0, 0)`` when there are fewer than three tokens.

    Returns ``output`` unchanged when it has no ``alphas`` entry; otherwise nothing is
    returned and the result is only visible through the mutation of ``output``.
    """
    if "alphas" not in output:
        return output

    alphas = output.pop("alphas")
    best_span = argmax(alphas)
    tokens = instance["tokens"].tokens

    nom = 0
    span = (0, 0)
    length = len(tokens)
    flag = False
    for i in range(1, length - 1):
        for j in range(i, length - 1):
            span = (i, j)
            if nom == best_span:
                flag = True
                break
            nom += 1
        if flag:
            # The inner ``break`` only leaves the inner loop; without stopping here the
            # next outer iteration would overwrite the span that was just found.
            break

    i, j = span
    best_tokens = tokens[i:j + 1]
    output["best_span"] = " ".join([token.text for token in best_tokens])
    output["nom"] = nom
    output["ij"] = [i, j]
    output["break"] = flag
    output["val"] = alphas[best_span]
def sanitize(result: JsonDict) -> JsonDict:
    """
    Return a new dict holding only the entries whose key starts with ``"best_span"``.

    ``result`` itself is left untouched and the relative order of the kept entries is
    preserved.
    """
    kept = {}
    for key, value in result.items():
        if key.startswith("best_span"):
            kept[key] = value
    return kept