예제 #1
0
    def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
        """Render one count-module prediction as a multi-line debug string.

        Expects ``outputs`` to carry passage attention/sigmoid vectors plus
        the count distribution and answers; ends with a separator rule.
        """
        # Use json.dumps(outputs) + "\n" to dump a dictionary
        pattn = myutils.round_all(outputs["passage_attention"], 4)
        psigmoid = myutils.round_all(outputs["passage_sigmoid"], 4)
        attn_sigm = list(zip(pattn, psigmoid))

        # NOTE(review): the 'count_distritbuion' key spelling matches the
        # producer of `outputs` — do not "correct" it here alone.
        lines = [
            f"Pattn: {pattn}",
            f"Psigm: {psigmoid}",
            f"Pattn_sigm: {attn_sigm}",
            f"Plen: {len(pattn)}",
            f"PattnSum: {sum(pattn)}",
            f"PSigmSum: {sum(psigmoid)}",
            f"CountMean: {outputs['count_mean']}",
            f"CountDist: {outputs['count_distritbuion']}",
            f"CountAnswer: {outputs['count_answer']}",
            f"Predicted CountAnswer: {outputs['pred_count']}",
            "--------------------------------------------------",
        ]
        return "\n".join(lines) + "\n"
예제 #2
0
def mostAttendedSpans(attention_vec: torch.FloatTensor,
                      tokens: List[str],
                      span_length=5,
                      num_spans=5):
    """ Visualize the highest-attention fixed-length token spans.

        Parameters:
        ----------
        attention_vec: Shape: (sequence_length, )
            Padded vector containing attention over a sequence
        tokens: List[str]
            List of tokens in the sequence
        span_length: int
            Length (in tokens) of each candidate span
        num_spans: int
            Max number of non-overlapping top spans to report (was a
            hard-coded 5; default keeps the old behavior)

        Returns:
        --------
        out_str: str
            "span_text:attention | ..." for the selected spans; empty
            string when the sequence is shorter than span_length
    """

    attention_aslist: List[float] = myutils.round_all(
        myutils.tocpuNPList(attention_vec), 3)
    # To remove padded elements
    attention_aslist = attention_aslist[:len(tokens)]

    # Attention mass of every contiguous span of length span_length
    span2atten = {}
    for start in range(0, len(tokens) - span_length + 1):
        end = start + span_length
        span2atten[(start, end)] = sum(attention_aslist[start:end])

    # Guard: no candidate spans (tokens shorter than span_length) — the
    # original indexed sorted_spanattn[0] and raised IndexError here.
    if not span2atten:
        return ""

    sorted_spanattn = myutils.sortDictByValue(span2atten, decreasing=True)
    sorted_spanattn = myutils.round_all(sorted_spanattn, 3)

    # Greedily keep the best-scoring spans that don't overlap earlier picks
    top_spans = [sorted_spanattn[0][0]]
    idx = 1
    while len(top_spans) < num_spans and idx < len(sorted_spanattn):
        span = sorted_spanattn[idx][0]
        if not any(myutils.isSpanOverlap(span, kept) for kept in top_spans):
            top_spans.append(span)
        idx += 1

    most_attention_spans = [
        " ".join(tokens[start:end]) for (start, end) in top_spans
    ]
    attention_values = [span2atten[span] for span in top_spans]
    out_str = ""
    for span, attn in zip(most_attention_spans, attention_values):
        out_str += "{}:{} | ".format(span, myutils.round_all(attn, 3))
    return out_str.strip()
예제 #3
0
def topProbMassElems(attention: torch.Tensor, support: List[Any], k=5):
    """ Get the top attended elems.

        Parameters:
        ----------
        attention: Shape: (padded_support_len)
            Padded vector containing attention over a sequence
        support: List[Any] List of len=support_len

        Returns:
        --------
        out_str: "elem: attn || ..." for the k highest-attention elements
    """

    # Round and truncate to the true (unpadded) support length
    rounded = myutils.round_all(myutils.tocpuNPList(attention), 3)
    attention_aslist: List[float] = rounded[:len(support)]

    # Pair each support element with its attention, highest attention first
    ranked = sorted(zip(support, attention_aslist),
                    key=lambda pair: pair[1],
                    reverse=True)

    pieces = [f"{elem}: {attn} || " for elem, attn in ranked[:k]]
    return "".join(pieces).strip()
예제 #4
0
def listTokensVis(attention_vec: torch.FloatTensor, tokens: List[str],
                  top_k: int = 10):
    """ Visualize an attention vector for a list of tokens.

        Parameters:
        ----------
        attention_vec: Shape: (sequence_length, )
            Padded vector containing attention over a sequence
        tokens: List[str]
            List of tokens in the sequence
        top_k: int
            Number of most-attended tokens in the summary view (was a
            hard-coded 10; default keeps the old behavior)

        Returns:
        --------
        complete_attention_vis: str
            "token|attn " for every token
        most_attended_vis: str
            "Most attended: token|attn " for the top_k tokens by attention
    """

    attention_aslist: List[float] = myutils.round_all(
        myutils.tocpuNPList(attention_vec), 3)
    # To remove padded elements
    attention_aslist = attention_aslist[:len(tokens)]

    complete_attention_vis = "".join(
        f"{token}|{attn} " for token, attn in zip(tokens, attention_aslist))

    # (token, attn) pairs sorted by decreasing attention
    sorted_token_attn = sorted(zip(tokens, attention_aslist),
                               key=lambda pair: pair[1],
                               reverse=True)
    most_attended_vis = "Most attended: " + "".join(
        f"{token}|{attn} " for token, attn in sorted_token_attn[:top_k])

    return complete_attention_vis.strip(), most_attended_vis.strip()
예제 #5
0
    def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
        """Render one model prediction as a human-readable multi-line string.

        Emits the question/passage, gold and predicted answers, F1/EM, the
        passage's dates/numbers/diffs, then every decoded program with its
        score, answer and execution tree, ending with a separator rule.
        """
        # Use json.dumps(outputs) + "\n" to dump a dictionary
        out_str = ''
        metadata = outputs['metadata']
        predicted_ans = outputs['predicted_answer']

        gold_passage_span_ans = metadata.get('answer_passage_spans', [])
        gold_question_span_ans = metadata.get('answer_question_spans', [])

        question_id = metadata['question_id']
        question = metadata['original_question']
        passage = metadata['original_passage']
        answer_annotation_dicts = metadata['answer_annotations']
        passage_date_values = metadata['passage_date_values']
        passage_num_values = metadata['passage_number_values']
        passage_year_diffs = metadata['passage_year_diffs']
        passage_num_diffs = metadata['passagenum_diffs']
        (exact_match, f1_score) = f1metric(predicted_ans, answer_annotation_dicts)

        out_str += "qid: {}".format(question_id) + '\n'
        out_str += question + '\n'
        out_str += passage + '\n'

        out_str += f'GoldAnswer: {answer_annotation_dicts}' + '\n'
        out_str += f'GoldPassageSpans:{gold_passage_span_ans}  GoldQuesSpans:{gold_question_span_ans}\n'

        out_str += f'PredictedAnswer: {predicted_ans}' + '\n'
        out_str += f'F1:{f1_score} EM:{exact_match}' + '\n'
        out_str += f'Dates: {passage_date_values}' + '\n'
        out_str += f'Nums: {passage_num_values}' + '\n'
        out_str += f'PassageNumDiffs: {passage_num_diffs}' + '\n'
        out_str += f'YearDiffs: {passage_year_diffs}' + '\n'

        logical_forms = outputs["logical_forms"]
        execution_vals = outputs["execution_vals"]
        actionseq_scores = outputs["batch_actionseq_scores"]
        all_predicted_answers = outputs["all_predicted_answers"]
        # BUG FIX: original tested the literal string 'logical_forms'
        # (always truthy); test the actual list of decoded programs.
        if logical_forms:
            for lf, ans, ex_vals, progscore in zip(logical_forms,
                                                   all_predicted_answers,
                                                   execution_vals,
                                                   actionseq_scores):
                ex_vals = myutils.round_all(ex_vals, 1)
                # Stripping the trailing new line
                ex_vals_str = self._print_ExecutionValTree(ex_vals, 0).strip()
                out_str += f"LogicalForm: {lf}\n"
                out_str += f"Score: {progscore}\n"
                out_str += f"Answer: {ans}\n"
                out_str += f"ExecutionTree:\n{ex_vals_str}"
                out_str += "\n"

        out_str += '--------------------------------------------------\n'

        return out_str
예제 #6
0
    def to_string():
        """Render the profiler's timers as a tab-separated table string.

        Each row shows a scope's call count, elapsed time, and its time as
        a percentage of the "forward" scope (0.0 when "forward" is absent).
        """
        perc_of_forward = collections.defaultdict(float)
        if "forward" in Profile.timer_dict:
            forward_time = Profile.timer_dict["forward"]
            for scope, elapsed in Profile.timer_dict.items():
                perc_of_forward[scope] = (elapsed / forward_time) * 100.0

        timer_dict = util.round_all(Profile.timer_dict, prec=4)
        perc_of_forward = util.round_all(perc_of_forward, prec=4)

        rows = [
            "\n------------------------  Profiler Stats  ------------------------\n",
            "Scope \t Num_Calls \t TimeElapsed \t Perc_of_Forward\n",
        ]
        for scope, elapsed in timer_dict.items():
            pct = perc_of_forward[scope] if scope in perc_of_forward else 0.0
            rows.append("{} \t {} \t {} seconds \t {} % \n".format(
                scope, Profile.num_calls[scope], elapsed, pct))
        rows.append("----------------------------------------------------------------\n")
        return "".join(rows)
    def _read(self, file_path: str):
        # pylint: disable=logging-fstring-interpolation
        """Generate `self._num_training_samples` synthetic count-training instances.

        Each instance pairs a number-distribution vector — `count_value`
        spikes of 1.0 at random positions, optionally with Gaussian noise
        and/or normalization — with its count label. `file_path` is unused;
        the data is generated, not read from disk.
        """
        logger.info(
            f"Making {self._num_training_samples} training examples with:\n"
            f"max_pdist_length: {self._max_dist_length}\n"
            f"min_dist_length: {self._min_dist_length}\n"
            f"max_count:{self._max_count}\n")

        instances: List[Instance] = []
        for _ in range(self._num_training_samples):
            fields: Dict[str, Field] = {}

            dist_length = random.randint(self._min_dist_length,
                                         self._max_dist_length)
            # randint's lower bound of 1 guarantees count_value >= 1, so no
            # zero-count guard is needed before sampling indices.
            count_value = random.randint(1, min(self._max_count, dist_length))

            number_distribution = [0.0] * dist_length
            # Add 1.0 at count_value distinct random positions.
            # BUG FIX: the inner loop variable previously shadowed the outer
            # sample-index loop variable `i`.
            indices = random.sample(range(dist_length), count_value)
            for idx in indices:
                number_distribution[idx] += 1.0

            if self._withnoise:
                std_dev = random.uniform(0.01, 0.1)
                number_distribution = [
                    x + abs(random.gauss(0, std_dev))
                    for x in number_distribution
                ]

            if self._normalized:
                # Sum is > 0 here: at least one spike of 1.0 was added above.
                attention_sum = sum(number_distribution)
                number_distribution = [
                    float(x) / attention_sum for x in number_distribution
                ]

            number_distribution = myutil.round_all(number_distribution, 3)

            print(f"{number_distribution}   {count_value}")

            fields["number_dist"] = ArrayField(np.array(number_distribution),
                                               padding_value=-1)

            fields["count_answer"] = LabelField(count_value,
                                                skip_indexing=True)

            instances.append(Instance(fields))
            self.instances_made += 1

        print(f"Instances made: {self.instances_made}")
        return instances
예제 #8
0
def quesParaSize(input_json):
    """Print dataset stats: para/question counts, max token lengths, and
    the question-type distribution (counts, then percentages)."""
    dataset = readDataset(input_json)

    numparas = 0
    numques = 0
    maxparalen = 0
    maxqueslen = 0
    qtype_dist = defaultdict(int)

    for _pid, pinfo in dataset.items():
        numparas += 1
        passage = pinfo[constants.tokenized_passage]
        maxparalen = max(maxparalen, len(passage.split(" ")))

        qa_pairs = pinfo[constants.qa_pairs]

        for qa in qa_pairs:
            numques += 1
            # BUG FIX: the tokenized question is a space-joined string (like
            # the tokenized passage above), so split before counting —
            # len() of the raw string was a character count, not tokens.
            qlen = len(qa[constants.tokenized_question].split(" "))
            maxqueslen = max(maxqueslen, qlen)

            if constants.qtype in qa:
                qtype_dist[qa[constants.qtype]] += 1
            else:
                qtype_dist["UNK"] += 1

    print("\nCount of QTypes")
    print(qtype_dist)
    print()

    # Convert counts to percentages in place (numques > 0 whenever
    # qtype_dist is non-empty, so the division is safe here)
    for k, v in qtype_dist.items():
        qtype_dist[k] = round_all(100 * (float(v) / numques), 1)

    print("\nPercentage of QTypes:")
    print(qtype_dist)
    print()

    print(f"Paras: {numparas}  MaxParaLen:{maxparalen}")
    print(f"Questions: {numques}  MaxQuesLen:{maxqueslen}")
예제 #9
0
def preprocess_HowManyYardsCount_ques(dataset):
    """ Here we make synthetic data for counting questions.
        Idea is to generate semi-gold passage attentions and count-answer to train the count module.

        Each question we generate will be UNK (irrelevant), containing:
            - qtype and program-supervision -- (count findPassageAttention_FAKE)
                This findPassageAttention_FAKE will not take question-attention as input, in fact the gold passage-attn
                as a side-arg.
            - passage_attention & count value
                We will generate semi-gold passage-attentions and count-values. These passage-attentions will be
                used in the program above.

        We generate these questions for passages that contain count-questions
    """

    new_dataset = {}
    total_ques = 0

    num_of_gen_ques = 0
    count_distribution = defaultdict(int)

    for passage_id, passage_info in dataset.items():
        new_qa_pairs = []
        for question_answer in passage_info[constants.qa_pairs]:
            total_ques += 1

            original_question = question_answer[constants.cleaned_question]
            question_lower = original_question.lower()

            tokenized_passage = passage_info[constants.tokenized_passage]
            passage_tokens = tokenized_passage.split(" ")

            # Only passages whose question looks like a count question
            if any(span in question_lower for span in COUNT_NGRAMS):
                attention, count, mask = make_count_instance(passage_tokens)
                if mask == 0:
                    # Generator could not build a usable instance; skip it
                    continue

                # Overwrite the answer with the synthetic count value
                answer = question_answer[constants.answer]
                answer["spans"] = []
                answer["number"] = str(count)

                question_answer[constants.answer_passage_spans] = []
                question_answer[constants.answer_question_spans] = []

                count_distribution[count] += 1

                # Mark the query id so synthetic questions are identifiable
                query_id = question_answer[constants.query_id]
                query_id += "-synthetic-count"
                question_answer[constants.query_id] = query_id

                question_answer[constants.question] = "synthetic count"
                question_answer[
                    constants.tokenized_question] = "synthetic count"
                question_answer[constants.cleaned_question] = "synthetic count"
                question_answer[constants.question_charidxs] = [0, 10]
                question_answer[constants.answer_type] = constants.NUM_TYPE

                question_answer[constants.qtype] = constants.SYN_COUNT_qtype
                question_answer[constants.program_supervised] = True

                question_answer[constants.pattn_supervised] = True
                question_answer[constants.passage_attn_supervision] = attention

                # Adding this so that the instance remains strongly supervised
                question_answer[constants.qattn_supervised] = True
                question_answer[constants.ques_attention_supervision] = [[
                    1.0, 1.0
                ]]  # Single attention vector of size=1

                # The final output of the program is enough to train, so no aux loss / execution supervision is needed
                # Still label as execution_supervised as it requires passing the pattn as side-arg
                # (sic: the constant's name carries an upstream typo)
                question_answer[constants.exection_supervised] = True

                new_qa_pairs.append(question_answer)

        if len(new_qa_pairs) > 0:
            passage_info[constants.qa_pairs] = new_qa_pairs
            new_dataset[passage_id] = passage_info
            num_of_gen_ques += len(new_qa_pairs)

    # Convert counts to percentages; guard the division in case no
    # synthetic questions were generated at all.
    if num_of_gen_ques > 0:
        for k, v in count_distribution.items():
            count_distribution[k] = util.round_all(
                (float(v) / num_of_gen_ques) * 100, 3)

    num_passages_after_prune = len(new_dataset)
    print(f"Passages:{num_passages_after_prune}  Questions:{num_of_gen_ques}")
    print(f"CountDist: {count_distribution}")

    return new_dataset
예제 #10
0
    def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
        """Render one model prediction as a human-readable multi-line string.

        Emits the question/passage, gold and predicted answers, F1/EM, the
        top program and its probability, the passage's numeric metadata,
        then every decoded program with its (log-)probability, answer and
        execution tree, ending with a separator rule.
        """
        # Use json.dumps(outputs) + "\n" to dump a dictionary
        out_str = ""
        metadata = outputs["metadata"]
        predicted_ans = outputs["predicted_answer"]
        # Retained for the commented-out module-attention debug block below
        module_debug_infos = outputs["modules_debug_infos"]

        gold_passage_span_ans = metadata.get("answer_passage_spans", [])
        gold_question_span_ans = metadata.get("answer_question_spans", [])

        question_id = metadata["question_id"]
        question = metadata["original_question"]
        passage = metadata["original_passage"]
        # Retained for the commented-out module-attention debug block below
        passage_tokens = metadata["passage_orig_tokens"]
        passage_wps = metadata["passage_tokens"]
        passage_wpidx2tokenidx = metadata["passage_wpidx2tokenidx"]
        answer_annotation_dicts = metadata["answer_annotations"]
        passage_date_values = metadata["passage_date_values"]
        passage_num_values = metadata["passage_number_values"]
        composed_numbers = metadata["composed_numbers"]
        passage_year_diffs = metadata["passage_year_diffs"]
        (exact_match, f1_score) = f1metric(predicted_ans,
                                           answer_annotation_dicts)

        out_str += "qid: {}".format(question_id) + "\n"
        out_str += question + "\n"
        out_str += passage + "\n"

        out_str += f"GoldAnswer: {answer_annotation_dicts}" + "\n"
        out_str += f"GoldPassageSpans:{gold_passage_span_ans}  GoldQuesSpans:{gold_question_span_ans}\n"

        out_str += f"PredictedAnswer: {predicted_ans}" + "\n"
        out_str += f"F1:{f1_score} EM:{exact_match}" + "\n"
        if outputs['logical_forms']:
            out_str += f"Top-Prog: {outputs['logical_forms'][0]}" + "\n"
        else:
            out_str += "Top-Prog: NO PROGRAM FOUND" + "\n"
        if outputs['batch_actionseq_probs']:
            out_str += f"Top-Prog-Prob: {outputs['batch_actionseq_probs'][0]}" + "\n"
        else:
            out_str += "Top-Prog-Prob: NO PROGRAM FOUND" + "\n"
        out_str += f"Dates: {passage_date_values}" + "\n"
        out_str += f"PassageNums: {passage_num_values}" + "\n"
        out_str += f"ComposedNumbers: {composed_numbers}" + "\n"
        out_str += f"YearDiffs: {passage_year_diffs}" + "\n"

        logical_forms = outputs["logical_forms"]
        program_probs = outputs["batch_actionseq_probs"]
        execution_vals = outputs["execution_vals"]
        program_logprobs = outputs["batch_actionseq_logprobs"]
        all_predicted_answers = outputs["all_predicted_answers"]
        # BUG FIX: original tested the literal string "logical_forms"
        # (always truthy); test the actual list of decoded programs.
        if logical_forms:
            for lf, ans, ex_vals, prog_logprob, prog_prob in zip(
                    logical_forms, all_predicted_answers, execution_vals,
                    program_logprobs, program_probs):
                ex_vals = myutils.round_all(ex_vals, 1)
                # Stripping the trailing new line
                ex_vals_str = self._print_ExecutionValTree(ex_vals, 0).strip()
                out_str += f"LogicalForm: {lf}\n"
                out_str += f"Prog_LogProb: {prog_logprob}\n"
                out_str += f"Prog_Prob: {prog_prob}\n"
                out_str += f"Answer: {ans}\n"
                out_str += f"ExecutionTree:\n{ex_vals_str}"
                out_str += "\n"

        # # This is the top scoring program
        # # List of dictionary where each dictionary contains a single module_name: pattn-value pair
        # module_debug_info: List[Dict] = module_debug_infos[0]
        # for module_dict in module_debug_info:
        #     module_name, pattn = list(module_dict.items())[0]
        #     print(module_name)
        #     print(f"{len(pattn)}  {len(passage_wpidx2tokenidx)}")
        #     assert len(pattn) == len(passage_wpidx2tokenidx)
        #
        # # print(module_debug_infos)
        # # print(passage_wpidx2tokenidx)

        out_str += "--------------------------------------------------\n"

        return out_str