def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    # Use json.dumps(outputs) + "\n" to dump a dictionary
    """Format the count-module debug fields of `outputs` into a printable string."""
    pattn = myutils.round_all(outputs["passage_attention"], 4)
    psigmoid = myutils.round_all(outputs["passage_sigmoid"], 4)
    attn_sigm = list(zip(pattn, psigmoid))

    lines = [
        f"Pattn: {pattn}",
        f"Psigm: {psigmoid}",
        f"Pattn_sigm: {attn_sigm}",
        f"Plen: {len(pattn)}",
        f"PattnSum: {sum(pattn)}",
        f"PSigmSum: {sum(psigmoid)}",
        f"CountMean: {outputs['count_mean']}",
        # NOTE(review): 'count_distritbuion' looks misspelled, but the key must
        # match whatever the producer writes into `outputs` — left unchanged.
        f"CountDist: {outputs['count_distritbuion']}",
        f"CountAnswer: {outputs['count_answer']}",
        f"Predicted CountAnswer: {outputs['pred_count']}",
        "--------------------------------------------------",
    ]
    return "\n".join(lines) + "\n"
def mostAttendedSpans(attention_vec: torch.FloatTensor, tokens: List[str], span_length=5, num_spans=5):
    """
    Find the most-attended contiguous token spans for an attention vector.

    Parameters:
    ----------
    attention_vec: Shape: (sequence_length, )
        Padded vector containing attention over a sequence
    tokens: List[str]
        List of tokens in the sequence
    span_length: int
        Length (in tokens) of each candidate span
    num_spans: int
        Maximum number of non-overlapping top spans to report (previously hard-coded to 5)

    Returns:
    --------
    out_str: ``"span_text:attn | ..."`` for up to ``num_spans`` non-overlapping spans
        ranked by total attention mass; empty string if the sequence is shorter
        than ``span_length``.
    """
    attention_aslist: List[float] = myutils.round_all(
        myutils.tocpuNPList(attention_vec), 3)
    # To remove padded elements
    attention_aslist = attention_aslist[:len(tokens)]

    # Total attention mass for every contiguous span of length `span_length`.
    span2atten = {}
    for start in range(0, len(tokens) - span_length + 1):
        end = start + span_length
        span2atten[(start, end)] = sum(attention_aslist[start:end])

    # BUGFIX: when the sequence is shorter than `span_length` there are no
    # candidate spans and `sorted_spanattn[0][0]` raised IndexError.
    if not span2atten:
        return ""

    sorted_spanattn = myutils.sortDictByValue(span2atten, decreasing=True)
    sorted_spanattn = myutils.round_all(sorted_spanattn, 3)

    # Greedily keep the highest-scoring spans that don't overlap already-kept ones.
    top_spans = [sorted_spanattn[0][0]]
    idx = 1
    while len(top_spans) < num_spans and idx < len(sorted_spanattn):
        span = sorted_spanattn[idx][0]
        if not any(myutils.isSpanOverlap(span, kept) for kept in top_spans):
            top_spans.append(span)
        idx += 1

    out_str = ""
    for (start, end) in top_spans:
        span_text = " ".join(tokens[start:end])
        attn = span2atten[(start, end)]
        out_str += "{}:{} | ".format(span_text, myutils.round_all(attn, 3))
    return out_str.strip()
def topProbMassElems(attention: torch.Tensor, support: List[Any], k=5):
    """
    Render the top-k attended elements of a support list as a string.

    Parameters:
    ----------
    attention: Shape: (padded_support_len)
        Padded vector containing attention over a sequence
    support: List[Any]
        List of len=support_len

    Returns:
    --------
    A string of "elem: attn || " entries for the k highest-attended elements.
    """
    rounded_attn = myutils.round_all(myutils.tocpuNPList(attention), 3)
    # Truncate away attention over padding before pairing with the support.
    paired = list(zip(support, rounded_attn[:len(support)]))
    paired.sort(key=lambda pair: pair[1], reverse=True)

    pieces = [f"{elem}: {attn} || " for elem, attn in paired[:k]]
    return "".join(pieces).strip()
def listTokensVis(attention_vec: torch.FloatTensor, tokens: List[str]):
    """
    Visualize an attention vector over a list of tokens.

    Parameters:
    ----------
    attention_vec: Shape: (sequence_length, )
        Padded vector containing attention over a sequence
    tokens: List[str]
        List of tokens in the sequence

    Returns:
    --------
    complete_attention_vis: "token|attn" entries for every token
    most_attended_vis: the same visualization restricted to the 10 most-attended tokens
    """
    attn_values: List[float] = myutils.round_all(myutils.tocpuNPList(attention_vec), 3)
    # To remove padded elements
    attn_values = attn_values[:len(tokens)]

    token_attn_pairs = list(zip(tokens, attn_values))
    complete_attention_vis = "".join(f"{tok}|{attn} " for tok, attn in token_attn_pairs)

    # Ten highest-attention tokens, most attended first.
    top_pairs = sorted(token_attn_pairs, key=lambda pair: pair[1], reverse=True)[:10]
    most_attended_vis = "Most attended: " + "".join(f"{tok}|{attn} " for tok, attn in top_pairs)

    return complete_attention_vis.strip(), most_attended_vis.strip()
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    # Use json.dumps(outputs) + "\n" to dump a dictionary
    """Format one prediction record (question, gold/predicted answers, programs
    and their execution trees) into a human-readable debug string."""
    out_str = ''
    metadata = outputs['metadata']
    predicted_ans = outputs['predicted_answer']
    gold_passage_span_ans = metadata['answer_passage_spans'] if 'answer_passage_spans' in metadata else []
    gold_question_span_ans = metadata['answer_question_spans'] if 'answer_question_spans' in metadata else []

    question_id = metadata['question_id']
    question = metadata['original_question']
    passage = metadata['original_passage']
    answer_annotation_dicts = metadata['answer_annotations']
    passage_date_values = metadata['passage_date_values']
    passage_num_values = metadata['passage_number_values']
    passage_year_diffs = metadata['passage_year_diffs']
    passage_num_diffs = metadata['passagenum_diffs']

    (exact_match, f1_score) = f1metric(predicted_ans, answer_annotation_dicts)

    out_str += "qid: {}".format(question_id) + '\n'
    out_str += question + '\n'
    out_str += passage + '\n'
    out_str += f'GoldAnswer: {answer_annotation_dicts}' + '\n'
    out_str += f'GoldPassageSpans:{gold_passage_span_ans} GoldQuesSpans:{gold_question_span_ans}\n'
    out_str += f'PredictedAnswer: {predicted_ans}' + '\n'
    out_str += f'F1:{f1_score} EM:{exact_match}' + '\n'
    out_str += f'Dates: {passage_date_values}' + '\n'
    out_str += f'Nums: {passage_num_values}' + '\n'
    out_str += f'PassageNumDiffs: {passage_num_diffs}' + '\n'
    out_str += f'YearDiffs: {passage_year_diffs}' + '\n'

    logical_forms = outputs["logical_forms"]
    execution_vals = outputs["execution_vals"]
    actionseq_scores = outputs["batch_actionseq_scores"]
    all_predicted_answers = outputs["all_predicted_answers"]

    # BUGFIX: the original tested `if 'logical_forms':` — a non-empty string
    # literal, which is always True. Test the decoded program list instead.
    if logical_forms:
        for lf, d, ex_vals, progscore in zip(logical_forms, all_predicted_answers,
                                             execution_vals, actionseq_scores):
            ex_vals = myutils.round_all(ex_vals, 1)
            # Stripping the trailing new line
            ex_vals_str = self._print_ExecutionValTree(ex_vals, 0).strip()
            out_str += f"LogicalForm: {lf}\n"
            out_str += f"Score: {progscore}\n"
            out_str += f"Answer: {d}\n"
            out_str += f"ExecutionTree:\n{ex_vals_str}"
            out_str += f"\n"

    out_str += '--------------------------------------------------\n'
    return out_str
def to_string():
    """Render the collected profiler timings as a formatted table string.

    Reads `Profile.timer_dict` / `Profile.num_calls` and reports each scope's
    call count, elapsed time, and its percentage of the "forward" scope's time.
    """
    perc_of_forward = collections.defaultdict(float)
    if "forward" in Profile.timer_dict:
        forward_time = Profile.timer_dict["forward"]
        # BUGFIX: guard against ZeroDivisionError when no forward time was recorded.
        if forward_time:
            for k, v in Profile.timer_dict.items():
                perc_of_forward[k] = (v / forward_time) * 100.0

    timer_dict = util.round_all(Profile.timer_dict, prec=4)
    perc_of_forward = util.round_all(perc_of_forward, prec=4)

    s = "\n------------------------ Profiler Stats ------------------------\n"
    s += "Scope \t Num_Calls \t TimeElapsed \t Perc_of_Forward\n"
    for k, v in timer_dict.items():
        num_calls = Profile.num_calls[k]
        # Scopes with no recorded forward percentage report 0.0 %.
        perc_forward = perc_of_forward[k] if k in perc_of_forward else 0.0
        s += "{} \t {} \t {} seconds \t {} % \n".format(
            k, num_calls, v, perc_forward)
    s += "----------------------------------------------------------------\n"
    return s
def _read(self, file_path: str):
    # pylint: disable=logging-fstring-interpolation
    """Generate synthetic (number-distribution, count) training instances.

    `file_path` is unused; every instance is randomly sampled from the
    reader's configured length/count ranges.
    """
    logger.info(
        f"Making {self._num_training_samples} training examples with:\n"
        f"max_pdist_length: {self._max_dist_length}\n"
        f"min_dist_length: {self._min_dist_length}\n"
        f"max_count:{self._max_count}\n")

    instances: List[Instance] = []
    for _ in range(self._num_training_samples):
        fields: Dict[str, Field] = {}
        dist_length = random.randint(self._min_dist_length, self._max_dist_length)
        # count_value is always >= 1 since randint's lower bound is 1.
        count_value = random.randint(1, min(self._max_count, dist_length))

        number_distribution = [0.0] * dist_length
        if count_value > 0:
            indices = random.sample(range(dist_length), count_value)
            # Add 1.0 to all sampled indices.
            # BUGFIX: this inner loop previously reused `i`, shadowing the
            # outer per-sample loop variable.
            for idx in indices:
                number_distribution[idx] += 1.0

        if self._withnoise:
            std_dev = random.uniform(0.01, 0.1)
            number_distribution = [
                x + abs(random.gauss(0, std_dev)) for x in number_distribution
            ]

        if self._normalized:
            # Safe: count_value >= 1 guarantees a positive sum.
            attention_sum = sum(number_distribution)
            number_distribution = [
                float(x) / attention_sum for x in number_distribution
            ]

        number_distribution = myutil.round_all(number_distribution, 3)
        print(f"{number_distribution} {count_value}")

        fields["number_dist"] = ArrayField(np.array(number_distribution), padding_value=-1)
        fields["count_answer"] = LabelField(count_value, skip_indexing=True)

        instances.append(Instance(fields))
        self.instances_made += 1

    print(f"Instances made: {self.instances_made}")
    return instances
def quesParaSize(input_json):
    """Print dataset statistics: paragraph/question counts, max lengths, and
    the question-type distribution (counts and percentages)."""
    dataset = readDataset(input_json)

    numparas = 0
    numques = 0
    maxparalen = 0
    maxqueslen = 0
    qtype_dist = defaultdict(int)

    for pid, pinfo in dataset.items():
        numparas += 1
        passage = pinfo[constants.tokenized_passage]
        plen = len(passage.split(" "))
        maxparalen = max(maxparalen, plen)

        for qa in pinfo[constants.qa_pairs]:
            numques += 1
            # BUGFIX: tokenized_question is a space-joined string (same format as
            # tokenized_passage); the original counted characters, not tokens.
            qlen = len(qa[constants.tokenized_question].split(" "))
            maxqueslen = max(maxqueslen, qlen)
            if constants.qtype in qa:
                qtype_dist[qa[constants.qtype]] += 1
            else:
                qtype_dist["UNK"] += 1

    print("\nCount of QTypes")
    print(qtype_dist)
    print()

    # Convert raw counts to percentages in place.
    for k, v in qtype_dist.items():
        qtype_dist[k] = round_all(100 * (float(v) / numques), 1)
    print("\nPercentage of QTypes:")
    print(qtype_dist)
    print()

    print(f"Paras: {numparas} MaxParaLen:{maxparalen}")
    print(f"Questions: {numques} MaxQuesLen:{maxqueslen}")
def preprocess_HowManyYardsCount_ques(dataset):
    """ Make synthetic data for counting questions.

    Idea is to generate semi-gold passage attentions and count-answer to train the count module.

    Each question we generate will be UNK (irrelevant), containing:
    - qtype and program-supervision -- (count findPassageAttention_FAKE)
      This findPassageAttention_FAKE will not take question-attention as input, in fact the gold
      passage-attn as a side-arg.
    - passage_attention & count value
      We will generate semi-gold passage-attentions and count-values.
      These passage-attentions will be used in the program above.

    We generate these questions for passages that contain count-questions.
    """
    new_dataset = {}
    total_ques = 0
    num_passages = len(dataset)
    num_of_gen_ques = 0
    count_distribution = defaultdict(int)

    for passage_id, passage_info in dataset.items():
        new_qa_pairs = []
        for question_answer in passage_info[constants.qa_pairs]:
            total_ques += 1

            original_question = question_answer[constants.cleaned_question]
            question_lower = original_question.lower()
            tokenized_ques = question_answer[constants.tokenized_question]
            tokenized_passage = passage_info[constants.tokenized_passage]
            passage_tokens = tokenized_passage.split(" ")

            # Only questions that look like count-questions produce synthetic examples.
            if any(span in question_lower for span in COUNT_NGRAMS):
                attention, count, mask = make_count_instance(passage_tokens)
                if mask == 0:
                    # No valid synthetic count instance could be built for this passage.
                    continue

                answer = question_answer[constants.answer]
                answer["spans"] = []
                answer["number"] = str(count)
                question_answer[constants.answer_passage_spans] = []
                question_answer[constants.answer_question_spans] = []
                count_distribution[count] += 1

                query_id = question_answer[constants.query_id]
                query_id += "-synthetic-count"
                question_answer[constants.query_id] = query_id

                # Replace the original question text with a fixed synthetic one.
                question_answer[constants.question] = "synthetic count"
                question_answer[constants.tokenized_question] = "synthetic count"
                question_answer[constants.cleaned_question] = "synthetic count"
                question_answer[constants.question_charidxs] = [0, 10]
                question_answer[constants.answer_type] = constants.NUM_TYPE

                question_answer[constants.qtype] = constants.SYN_COUNT_qtype
                question_answer[constants.program_supervised] = True
                question_answer[constants.pattn_supervised] = True
                question_answer[constants.passage_attn_supervision] = attention

                # Adding this so that the instance remains strongly supervised
                question_answer[constants.qattn_supervised] = True
                # Single attention vector of size=1
                question_answer[constants.ques_attention_supervision] = [[1.0, 1.0]]

                # The final output of the program is enough to train, so no aux loss /
                # execution supervision is needed. Still label as execution_supervised
                # as it requires passing the pattn as side-arg.
                # NOTE(review): "exection" looks misspelled, but it must match the
                # attribute actually declared in `constants` — left unchanged.
                question_answer[constants.exection_supervised] = True

                new_qa_pairs.append(question_answer)

        if len(new_qa_pairs) > 0:
            passage_info[constants.qa_pairs] = new_qa_pairs
            new_dataset[passage_id] = passage_info
            num_of_gen_ques += len(new_qa_pairs)

    # BUGFIX: guard against ZeroDivisionError when no synthetic questions were made.
    if num_of_gen_ques > 0:
        for k, v in count_distribution.items():
            count_distribution[k] = util.round_all(
                (float(v) / num_of_gen_ques) * 100, 3)

    num_passages_after_prune = len(new_dataset)
    print(f"Passages:{num_passages_after_prune} Questions:{num_of_gen_ques}")
    print(f"CountDist: {count_distribution}")

    return new_dataset
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    # Use json.dumps(outputs) + "\n" to dump a dictionary
    """Format one full prediction record (question, gold/predicted answers,
    number/date values, and every decoded program with its execution tree)
    into a human-readable debug string."""
    out_str = ""
    metadata = outputs["metadata"]
    predicted_ans = outputs["predicted_answer"]
    module_debug_infos = outputs["modules_debug_infos"]

    gold_passage_span_ans = metadata["answer_passage_spans"] if "answer_passage_spans" in metadata else []
    gold_question_span_ans = metadata["answer_question_spans"] if "answer_question_spans" in metadata else []

    question_id = metadata["question_id"]
    question = metadata["original_question"]
    passage = metadata["original_passage"]
    passage_tokens = metadata["passage_orig_tokens"]
    passage_wps = metadata["passage_tokens"]
    passage_wpidx2tokenidx = metadata["passage_wpidx2tokenidx"]
    answer_annotation_dicts = metadata["answer_annotations"]
    passage_date_values = metadata["passage_date_values"]
    passage_num_values = metadata["passage_number_values"]
    composed_numbers = metadata["composed_numbers"]
    passage_year_diffs = metadata["passage_year_diffs"]

    (exact_match, f1_score) = f1metric(predicted_ans, answer_annotation_dicts)

    out_str += "qid: {}".format(question_id) + "\n"
    out_str += question + "\n"
    out_str += passage + "\n"
    out_str += f"GoldAnswer: {answer_annotation_dicts}" + "\n"
    out_str += f"GoldPassageSpans:{gold_passage_span_ans} GoldQuesSpans:{gold_question_span_ans}\n"
    out_str += f"PredictedAnswer: {predicted_ans}" + "\n"
    out_str += f"F1:{f1_score} EM:{exact_match}" + "\n"

    if outputs['logical_forms']:
        out_str += f"Top-Prog: {outputs['logical_forms'][0]}" + "\n"
    else:
        out_str += f"Top-Prog: NO PROGRAM FOUND" + "\n"
    if outputs['batch_actionseq_probs']:
        out_str += f"Top-Prog-Prob: {outputs['batch_actionseq_probs'][0]}" + "\n"
    else:
        out_str += f"Top-Prog-Prob: NO PROGRAM FOUND" + "\n"

    out_str += f"Dates: {passage_date_values}" + "\n"
    out_str += f"PassageNums: {passage_num_values}" + "\n"
    out_str += f"ComposedNumbers: {composed_numbers}" + "\n"
    out_str += f"YearDiffs: {passage_year_diffs}" + "\n"

    logical_forms = outputs["logical_forms"]
    program_probs = outputs["batch_actionseq_probs"]
    execution_vals = outputs["execution_vals"]
    program_logprobs = outputs["batch_actionseq_logprobs"]
    all_predicted_answers = outputs["all_predicted_answers"]

    # BUGFIX: the original tested `if "logical_forms":` — a non-empty string
    # literal, which is always True. Test the decoded program list instead.
    if logical_forms:
        for lf, d, ex_vals, prog_logprob, prog_prob in zip(
                logical_forms, all_predicted_answers, execution_vals,
                program_logprobs, program_probs):
            ex_vals = myutils.round_all(ex_vals, 1)
            # Stripping the trailing new line
            ex_vals_str = self._print_ExecutionValTree(ex_vals, 0).strip()
            out_str += f"LogicalForm: {lf}\n"
            out_str += f"Prog_LogProb: {prog_logprob}\n"
            out_str += f"Prog_Prob: {prog_prob}\n"
            out_str += f"Answer: {d}\n"
            out_str += f"ExecutionTree:\n{ex_vals_str}"
            out_str += f"\n"

    out_str += "--------------------------------------------------\n"
    return out_str