Пример #1
0
def print_param():
    """Write one dense kernel of the base and fine-tuned checkpoints to HTML.

    Loads the parameters of the base BERT checkpoint and of a fixed NLI
    checkpoint, takes the layer-0 output dense kernel from each, and writes
    one two-row table per kernel row to "bert_dense_param.html": the first
    row holds the base values, the second the fine-tuned values. Cell
    highlight scores alternate between 0 and 100 in a checkerboard pattern.
    """
    p_base = load_param(get_bert_full_path())
    # Raw string: the non-raw form relied on invalid escape sequences
    # (backslash followed by w, C, o, m), which newer Python versions warn
    # about. The resulting path value is unchanged.
    nli_path = r"C:\work\Code\Chair\output\model\runs\nli_model.ckpt-75000_NLI\model-0"
    p_ft = load_param(nli_path)

    key = "bert/encoder/layer_0/output/dense/kernel"
    param1 = p_base[key]
    param2 = p_ft[key]
    html = HtmlVisualizer("bert_dense_param.html")

    n_rows, n_cols = param1.shape

    # The starting score flips for every row so that neighbouring rows are
    # offset by one column (checkerboard).
    row_start_score = 100
    for i in range(n_rows):
        row_start_score = 100 - row_start_score
        score = row_start_score
        row1 = []
        row2 = []
        for j in range(n_cols):
            score = 100 - score
            row1.append(Cell("{0:.4f}".format(param1[i, j]), score))
            row2.append(Cell("{0:.4f}".format(param2[i, j]), score))
        html.write_table([row1, row2])
Пример #2
0
def main(config):
    """Write claim texts and links to their top retrieved documents as HTML.

    Reads the ranked list at config['q_res_path'], and for the first 10 query
    ids (in sorted order) emits a row with the query id and claim, followed
    by one link row for each of up to 10 de-duplicated document ids taken
    from the top 30 entries. Output goes to "claim_docs_urls.html".
    """
    q_res_path = config['q_res_path']
    ranked_list = load_galago_ranked_list(q_res_path)
    claim_d = claims_to_dict(get_all_claims())

    docs_per_query = 10
    link_prefix = "http://localhost:36559/document?identifier="
    table = []
    for qid in sorted(ranked_list.keys())[:10]:
        # Over-fetch three times the quota so de-duplication still leaves
        # enough candidates.
        candidates = ranked_list[qid][:docs_per_query * 3]
        unique_ids = remove_duplicate([entry.doc_id for entry in candidates])
        claim = claim_d[int(qid)]
        table.append([Cell("{} : {}".format(qid, claim))])
        for doc_id in unique_ids[:docs_per_query]:
            link = "<a href=\"{}\">{}</a>".format(link_prefix + doc_id, doc_id)
            table.append([Cell(link)])

    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(table)
Пример #3
0
def view_grad_overlap_hidden():
    """Render per-layer 'h_overlap' sums and standard deviations per token.

    For every entry of the "ukp_feature_overlap.pickle" prediction set, writes
    a token row (mask tokens highlighted) followed, for each of 12 layers, by
    a row of the overlap summed over axis 2 (scaled by 1e6) and a row of its
    standard deviation over axis 2 (scaled by 1e8). Output goes to
    "ukp_feature_overlap.html"; the per-example masked LM loss is printed.
    """
    filename = "ukp_feature_overlap.pickle"

    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)

    for entry in data:
        tokens = entry.get_mask_resolved_input_mask_with_input()
        h_overlap = entry.get_vector('h_overlap')

        std = np.std(h_overlap, axis=2)
        h_overlap = np.sum(h_overlap, axis=2)

        highlight = lmap(is_mask, tokens)
        rows = [data.cells_from_tokens(tokens, highlight)]
        for layer_i in range(12):
            # Scale the raw values up so they register as highlight scores.
            overlap_scores = [v * 1e6 for v in h_overlap[layer_i, :]]
            rows.append(data.cells_from_scores(overlap_scores))

            std_scores = [v * 1e8 for v in std[layer_i, :]]
            rows.append(data.cells_from_scores(std_scores))

        print(entry.get_vector("masked_lm_example_loss"))
        html_writer.multirow_print_from_cells_list(rows, 40)
Пример #4
0
def main(config):
    """Write query texts and links to their top retrieved documents as HTML.

    Reads the ranked list at config['q_res_path'] and the query-id -> text
    mapping from the JSON file at config['query_text_d']. For the first 100
    query ids (in sorted order) emits a row with the query id and text,
    followed by one link row for each of the top 10 document ids. Output goes
    to config['save_path'].
    """
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # Context manager so the JSON file handle is closed deterministically.
    with open(config['query_text_d']) as f:
        query_text_d = json.load(f)
    save_name = config['save_path']

    keys = sorted(ranked_list.keys())
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:100]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = [e.doc_id for e in entries]
        query_text = query_text_d[query_id]
        s = "{} : {}".format(query_id, query_text)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])

    html = HtmlVisualizer(save_name)
    html.write_table(rows)
Пример #5
0
def show(html_visualizer: HtmlVisualizer,
         features: List[ParagraphClaimPersFeature]):
    """Render each feature's paragraphs with claim/perspective words highlighted.

    Prints the cid of the first feature, then for every feature writes the
    claim, the perspective and the label, followed by each scored paragraph
    as a 20-wide multirow. A token is highlighted (score 100) when its
    lower-cased form occurs among the lower-cased NLTK tokens of the claim
    or perspective text.
    """
    print("Cid: ", features[0].claim_pers.cid)
    for feature in features:
        claim_pers = feature.claim_pers
        html_visualizer.write_paragraph("Claim: " + claim_pers.claim_text)
        html_visualizer.write_paragraph("Perspective: " + claim_pers.p_text)

        query_tokens = (nltk.word_tokenize(claim_pers.claim_text)
                        + nltk.word_tokenize(claim_pers.p_text))
        query_vocab = {tok.lower() for tok in query_tokens}
        print(query_vocab)

        html_visualizer.write_paragraph("Label : {}".format(claim_pers.label))
        for score_paragraph in feature.feature:
            cells = [
                Cell(tok, 100 if tok.lower() in query_vocab else 0)
                for tok in score_paragraph.paragraph.tokens
            ]
            html_visualizer.write_paragraph("---")
            html_visualizer.multirow_print(cells, width=20)
Пример #6
0
def main():
    """Write the same two-span line twice to "tooltip_test.html" with tooltips on."""
    visualizer = HtmlVisualizer(
        "tooltip_test.html",
        dark_mode=False,
        use_tooltip=True,
    )

    spans = [("1", "hello"), ("2", "word")]
    for _ in range(2):
        visualizer.write_span_line(spans)
Пример #7
0
def visualize_prediction_data(data_id):
    """Render per-entry losses with and without dictionary definitions.

    Reads the per-entry sample counts, the pickled loss outputs and the
    TF-record features for `data_id` from `working_path`, then writes up to
    100 entries to "entry_prediction.html". For each entry with more than one
    sample, a table lists the loss of the first sample followed by the loss
    and definition text of every further sample; a loss below 90% of the
    first sample's loss is highlighted.

    Args:
        data_id: name of the data shard; used as a file name and formatted
            into "entry{}.pickle".
    """
    tokenizer = get_tokenizer()
    count_path = os.path.join(working_path, "entry_prediction_n", data_id)
    with open(count_path, "r") as f:
        num_samples_list = f.readlines()
    loss_path = os.path.join(working_path, "entry_loss",
                             "entry{}.pickle".format(data_id))
    with open(loss_path, "rb") as f:
        loss_outputs_list = pickle.load(f)
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    instance_idx = 0
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    # Show at most 100 entries, but never index past the available counts.
    n = min(100, len(num_samples_list))
    html = HtmlVisualizer("entry_prediction.html")
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = next(feature_itr)
        feature = Feature2Text(first_inst, tokenizer)

        html.write_headline("Input:")
        html.write_paragraph(feature.get_input_as_text(True, True))
        html.write_headline("Word:" + feature.get_selected_word_text())

        if instance_idx + n_sample >= len(loss_outputs):
            break

        if n_sample == 1:
            # NOTE(review): instance_idx is not advanced here although one
            # record was consumed from feature_itr; confirm that
            # loss_outputs has no entry for single-sample cases.
            continue

        rows = []
        no_dict_loss = loss_outputs[instance_idx]
        rows.append([Cell(no_dict_loss, 0), Cell("")])
        instance_idx += 1
        for _ in range(1, n_sample):
            feature = Feature2Text(next(feature_itr), tokenizer)
            def_cell = Cell(feature.get_def_as_text())
            loss = loss_outputs[instance_idx]
            # Highlight when this sample's loss is under 90% of the first
            # sample's loss.
            hl_score = 100 if loss < no_dict_loss * 0.9 else 0
            rows.append([Cell(loss, hl_score), def_cell])
            instance_idx += 1

        html.write_table(rows)
Пример #8
0
def loss_view(dir_path):
    """Render the mask-resolved tokens of every pickled item under `dir_path`.

    Each file returned by `get_dir_files(dir_path)` is unpickled and iterated
    as 3-tuples (input_ids, masked_input_ids, masked_lm_example_loss). The
    two id sequences are converted to tokens, merged with `mask_resolve_1`,
    and written to "ukp_lm_grad_high.html" with mask tokens highlighted. The
    loss element is not used.

    Args:
        dir_path: directory whose files are loaded with `pickle`.
    """
    tokenizer = get_tokenizer()
    html_writer = HtmlVisualizer("ukp_lm_grad_high.html", dark_mode=False)

    for file_path in get_dir_files(dir_path):
        # NOTE: pickle executes arbitrary code on load; only use on trusted files.
        with open(file_path, "rb") as f:
            items = pickle.load(f)

        for input_ids, masked_input_ids, _ in items:
            tokens = mask_resolve_1(
                tokenizer.convert_ids_to_tokens(input_ids),
                tokenizer.convert_ids_to_tokens(masked_input_ids))
            highlight = lmap(is_mask, tokens)

            cells = cells_from_tokens(tokens, highlight)
            html_writer.multirow_print(cells)
Пример #9
0
def main():
    """Write one table row of confidence-highlighted cells per cid.

    The run name comes from sys.argv[1]; the info and score files are looked
    up under <output_path>/cppnc. Each row starts with the cid, followed by
    one empty cell per confidence whose highlight score is confidence * 100.
    Output goes to "confidence.html".
    """
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    cid_and_confidences = get_confidence_list_per_cid(
        os.path.join(out_dir, "cppnc_triple_all_dev_info"),
        os.path.join(out_dir, save_name + ".score"),
    )

    table = [
        [Cell(str(cid))]
        + [Cell("", highlight_score=conf * 100) for conf in confidences]
        for cid, confidences in cid_and_confidences.items()
    ]

    html = HtmlVisualizer("confidence.html")
    html.write_table(table)
Пример #10
0
def main():
    """Render gold token scores for every 10th entry of a prediction file.

    The prediction file path comes from sys.argv[1]. For each selected entry
    the non-padding tokens are written to "toke_score_gold.html": a token is
    coloured "B" when the first component of its label vector is positive and
    "R" otherwise, with highlight min(|score| * 10000, 100). Stopwords get
    highlight 0, and tokens that also occur in the first segment are shown in
    "G" with highlight 50. Stops after entry index 10000.
    """
    file_path = sys.argv[1]
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("toke_score_gold.html")
    stopwords = load_stopwords_for_query()

    skip = 10
    for entry_idx, entry in enumerate(viewer):
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = np.reshape(entry.get_vector("label_ids"), [-1, 2])
        seg1, _ = split_p_h_with_input_ids(tokens, input_ids)

        # A sequence that fills the whole window has no "[PAD]"; treat it as
        # fully valid instead of crashing on ValueError.
        try:
            pad_idx = tokens.index("[PAD]")
        except ValueError:
            pad_idx = len(tokens)
        assert pad_idx > 0

        cells = []
        for idx in range(pad_idx):
            token = tokens[idx]
            score = label_ids[idx][0]

            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                highlight_score = 50
                color = "G"

            cells.append(Cell(token,
                              highlight_score=highlight_score,
                              target_color=color))
        # The second (empty) row is kept so the emitted layout is unchanged.
        html.multirow_print_from_cells_list([cells, []])

        if entry_idx > 10000:
            break
Пример #11
0
def main():
    """Write a ranked list as an HTML table with links to per-document pages.

    Command-line arguments: the ranked list path, the directory name used in
    the relative links, and the output HTML path. Each row holds query id,
    rank, score and a link "./<dir>/<doc_id>.html".
    """
    first_list_path = sys.argv[1]
    dir_path = sys.argv[2]
    save_path = sys.argv[3]

    grouped = load_ranked_list_grouped(first_list_path)
    all_entries = flatten(grouped.values())

    html = HtmlVisualizer(save_path)
    table = []
    for entry in all_entries:
        link = "<a href=\"./{}/{}.html\">{}</a>".format(
            dir_path, entry.doc_id, entry.doc_id)
        table.append(lmap(Cell, [entry.query_id, entry.rank, entry.score, link]))
    html.write_table(table)
Пример #12
0
def print_paragraph_feature(pf_list: List[ParagraphFeature],
                            out_path: FilePath):
    """Write both texts and all paragraphs of every feature to `out_path`.

    For each item, "Text 1" and "Text 2" of its datapoint are written,
    followed by one paragraph per feature with the tokens joined by spaces.
    The visualizer is closed at the end.
    """
    writer = HtmlVisualizer(out_path)
    for item in pf_list:
        writer.write_paragraph("Text 1: " + item.datapoint.text1)
        writer.write_paragraph("Text 2: " + item.datapoint.text2)
        for scored in item.feature:
            writer.write_paragraph(" ".join(scored.paragraph.tokens))

    writer.close()
Пример #13
0
def print_file(pred_path):
    """Split mixed-prediction groups into "true" and "false" HTML views.

    Groups whose paragraphs are all scored above 0.5, or none of them, are
    skipped. For the remaining groups each paragraph goes to
    "pc_view_true.html" (score > 0.5) or "pc_view_false.html", prefixed by
    its running index within that class, the claim and the perspective;
    paragraph subwords that occur in the claim or perspective are
    highlighted. Stops once more than 100 groups have been written.
    """
    grouped = load_prediction(pred_path)
    html_pos = HtmlVisualizer("pc_view_true.html")
    html_neg = HtmlVisualizer("pc_view_false.html")
    writers = {True: html_pos, False: html_neg}

    n_groups = 0
    for key in grouped:
        paras = grouped[key]

        n_true = sum(1 for item in paras if item[1] > 0.5)
        if n_true in (0, len(paras)):
            continue

        totals = {True: n_true, False: len(paras) - n_true}
        seen = {True: 0, False: 0}
        n_groups += 1
        for _, score, tokens in paras:
            is_true = bool(score > 0.5)
            html = writers[is_true]
            claim, perspective, paragraph = split_3segments(tokens)
            highlight_terms = set(claim + perspective)

            html.write_paragraph("{} of {}".format(seen[is_true], totals[is_true]))
            seen[is_true] += 1

            html.write_paragraph("claim : " + pretty_tokens(claim))
            html.write_paragraph("perspective : " + pretty_tokens(perspective))

            cells = lmap(
                lambda sw: (Cell(sw, highlight_score=100)
                            if sw in highlight_terms else Cell(sw)),
                paragraph)
            html.multirow_print(cells)

        if n_groups > 100:
            break
Пример #14
0
def main():
    """Write a table linking each query to up to 10 of its documents.

    Iterates the (query, k_list) pairs from `load_qk()`. For every query, up
    to 10 distinct document ids are emitted as rows of
    (data_id, claim id, claim text, link), where data_id is the running index
    of the query and the link points to
    <output_path>/pc_docs_html/<doc_id>.html. Output goes to
    "doc_relevance_and_value.html".
    """
    html = HtmlVisualizer("doc_relevance_and_value.html")
    rows = []
    for data_id, (query, k_list) in enumerate(load_qk()):
        claim_id = query.query_id
        claim_text = query.text

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # 10 documents chosen are the same on every run (slicing a set was
        # order-dependent on string hashing).
        doc_ids = list(dict.fromkeys(k.doc_id for k in k_list))
        for doc_id in doc_ids[:10]:
            url = os.path.join(output_path, "pc_docs_html", doc_id + ".html")
            a = "<a href=\"{}\">url</a>".format(url)
            rows.append(
                [Cell(data_id), Cell(claim_id), Cell(claim_text), Cell(a)])

    html.write_table(rows)
Пример #15
0
def analyze_hv(hv_tt, hv_lm, tt_grad, tokenizer):
    """Print per-token `diff_and_grad` results and write them to "Preserved.html".

    For every instance the tokens are printed tab-separated. Then, for layer
    index 1 only (every other layer index is skipped by the loop below),
    `diff_and_grad` is evaluated at each sequence position and both of its
    results are printed. The first result is also collected and rendered as a
    single-row table of tokens, highlighted by result / hidden_dim * 100.

    Args:
        hv_tt: passed through `reshape`; presumably hidden values of one
            model -- TODO confirm against the caller.
        hv_lm: passed through `reshape`; must yield as many instances as
            `hv_tt` (asserted).
        tt_grad: reshaped with `reshape_gradienet` using seq_len=200 and
            hidden_dim=768.
        tokenizer: object providing `convert_ids_to_tokens`.
    """
    batch_size = 16  # NOTE(review): not used anywhere in this function
    seq_len = 200
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(tt_grad, seq_len, hidden_dim, False)

    # x_list is overwritten by the second call; only the one returned for
    # hv_lm is used below.
    hv_tt, x_list = reshape(hv_tt)
    hv_lm, x_list = reshape(hv_lm)

    assert len(hv_lm) == len(hv_tt)

    html = HtmlVisualizer("Preserved.html")
    for inst_i in range(len(hv_lm)):
        print("\t", end="")
        tokens = tokenizer.convert_ids_to_tokens(x_list[inst_i])
        # Header line: one tab-separated token per sequence position.
        for seq_i in range(seq_len):
            token = tokenizer.convert_ids_to_tokens([x_list[inst_i, seq_i]])[0]
            print("{}".format(token), end="\t")
        print()
        scores = []
        for layer_i in range(13):
            # Only layer index 1 is analysed; the remaining indices are
            # skipped, so `scores` ends up with seq_len entries.
            if layer_i != 1:
                continue
            layer_no = layer_i
            if layer_no >= 1:
                print("Layer {} :".format(layer_no), end="\t")
            else:
                print("Embedding:", end="\t")
            for seq_i in range(seq_len):
                n_diff_1, n_diff_2 = diff_and_grad(
                    hv_lm[inst_i, layer_i, seq_i], hv_tt[inst_i, layer_i,
                                                         seq_i],
                    reshaped_grad[inst_i, layer_i, seq_i])
                scores.append(n_diff_1)
                print("{}({})".format(n_diff_1, n_diff_2), end="\t")
            print("\n")

        # One table row per instance: tokens highlighted by the first
        # diff_and_grad result relative to hidden_dim.
        row = []
        for t, s in zip(tokens, scores):
            score = s / hidden_dim * 100
            row.append(Cell(t, score))
        html.write_table([row])
        print("-----------------")
Пример #16
0
def main():
    """Write an enriched ranked list as an HTML table with document links.

    Command-line arguments: the ranked list path, the directory holding the
    per-document HTML files, and the output HTML path. Entries are enriched
    via `enrich` with the pickled "urls_d" mapping; each row holds query id,
    rank, score, a link to the document page (opened in a new tab), title
    and url. The table is written with CSS class "table".
    """
    first_list_path = sys.argv[1]
    dir_path = sys.argv[2]
    save_path = sys.argv[3]
    grouped = load_ranked_list_grouped(first_list_path)

    def get_html_path_fn(doc_id):
        return os.path.join(dir_path, "{}.html".format(doc_id))

    doc_id_to_url = load_from_pickle("urls_d")
    enriched = [
        enrich(entry, get_html_path_fn, doc_id_to_url)
        for entry in flatten(grouped.values())
    ]
    extra_styles = [get_link_highlight_code(), get_bootstrap_include_source()]
    html = HtmlVisualizer(save_path, additional_styles=extra_styles)

    # NOTE(review): the header cells are built but never written to the
    # table; kept as-is to preserve behavior.
    column_specs = [
        ("query",), ("rank",), ("score",), ("doc_id",), ("title", 300), ("url",),
    ]
    head = [get_table_head_cell(*spec) for spec in column_specs]

    table = []
    for entry in enriched:
        link = "<a href=\"{}\" target=\"_blank\">{}</a>".format(
            get_html_path_fn(entry.doc_id), entry.doc_id)
        fields = [entry.query_id, entry.rank, entry.score, link,
                  entry.title, entry.url]
        table.append(lmap(Cell, fields))
    html.write_table_with_class(table, "table")
Пример #17
0
def show(filename):
    """Render tokens with model scores and gold labels at masked positions.

    For every entry of the prediction set, writes three aligned rows to
    "token_scoring.html" (20 cells wide): tokens, model scores, gold labels.
    Only positions with a truthy label mask get score cells. They are
    coloured "G" when score and label are both positive, "B" when both are
    negative and "R" otherwise. Rendering stops at the first "[PAD]".
    """
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer("token_scoring.html", dark_mode=False)

    for entry in data:
        tokens = entry.get_tokens("input_ids")
        logits = entry.get_vector("logits")
        masks = entry.get_vector("label_masks")
        ids = entry.get_vector("labels")

        token_row, pred_row, gold_row = [], [], []

        for idx, token in enumerate(tokens):
            if token == "[PAD]":
                break
            model_score = logits[idx][0]
            if not masks[idx]:
                pred_cell = Cell("")
                gold_cell = Cell("")
            else:
                gold = ids[idx]
                if model_score > 0 and gold > 0:
                    color = "G"
                elif model_score < 0 and gold < 0:
                    color = "B"
                else:
                    color = "R"
                pred_cell = Cell("{0:.2f}".format(model_score), 100, target_color=color)
                gold_cell = Cell("{0:.2f}".format(gold), 100, target_color=color)

            token_row.append(Cell(token))
            pred_row.append(pred_cell)
            gold_row.append(gold_cell)

        html_writer.multirow_print_from_cells_list(
            [token_row, pred_row, gold_row], 20)
Пример #18
0
def draw():
    """Render tokens coloured by gradient sign and magnitude.

    Reads the "pc_para_I_grad" prediction set and writes
    "pc_para_I_grad.html". Each token's highlight is min(|grad| * 1e4, 255),
    coloured "B" for a positive gradient and "R" otherwise. The label and the
    argmax of the softmaxed logits are written above each instance, and the
    raw gradient is printed.
    """
    # Other prediction sets this has been pointed at: "pc_para_D_grad",
    # "pc_para_H_grad".
    name = "pc_para_I_grad"
    data = EstimatorPredictionViewerGosford(name)
    html_writer = HtmlVisualizer(name + ".html", dark_mode=False)

    for entry in data:
        tokens = entry.get_tokens("input_ids")
        grad = entry.get_vector("gradient")

        cells = data.cells_from_tokens(tokens)
        for i, cell in enumerate(cells):
            cell.highlight_score = min(abs(grad[i]) * 1e4, 255)
            cell.target_color = "B" if grad[i] > 0 else "R"
        print(grad)

        prob = softmax(entry.get_vector("logits"))
        pred = np.argmax(prob)

        label = entry.get_vector("labels")
        html_writer.write_paragraph("Label={} / Pred={}".format(str(label), pred))
        html_writer.multirow_print(cells)
Пример #19
0
def draw2(in_file, out_file):
    """Render tokens with numbered mask positions plus per-mask probs and losses.

    Reads the prediction set <output_path>/<in_file> and writes the first 101
    instances to `out_file`. Each token at a masked LM position is rewritten
    as "[<i>:<token>]" where i is the index of the position; below the tokens
    four rows list prob1, prob2, per_example_loss1 and per_example_loss2.

    Args:
        in_file: prediction file name, relative to `output_path`.
        out_file: path of the HTML file to write.
    """
    filename = os.path.join(output_path, in_file)
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer(out_file, dark_mode=False)

    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break

        tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")
        real_loss1 = entry.get_vector("per_example_loss1")
        real_loss2 = entry.get_vector("per_example_loss2")

        masked_lm_positions = entry.get_vector("masked_lm_positions")

        # Tag every masked position with its index so it can be matched to
        # the columns of the rows below.
        for i, loc in enumerate(masked_lm_positions):
            tokens[loc] = "[{}:{}]".format(i, tokens[loc])

        html_writer.multirow_print(data.cells_from_tokens(tokens))

        row2 = [Cell("prob1:")] + data.cells_from_anything(prob1)
        row3 = [Cell("prob2:")] + data.cells_from_anything(prob2)
        row4 = [Cell("real_loss1:")] + data.cells_from_anything(real_loss1)
        row5 = [Cell("real_loss2:")] + data.cells_from_anything(real_loss2)
        html_writer.multirow_print_from_cells_list([row2, row3, row4, row5])
Пример #20
0
def view_grad_overlap_per_mask():
    """Render overlap scores and top-5 predicted terms for every masked position.

    Reads the "ukp_lm_probs.pickle" prediction set and writes
    "ukp_lm_probs.html". For each instance the tokens are shown 20 per row,
    with every masked position prefixed by "<position>-" and highlighted by
    score / 10000 * 255. A table follows with one row per masked position:
    position, overlap score, then the five highest-log-prob vocabulary terms
    with their probabilities.
    """
    filename = "ukp_lm_probs.pickle"

    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    tokenizer = data.tokenizer
    for entry in data:
        tokens = entry.get_mask_resolved_input_mask_with_input()
        # Highlight flags are taken before the position prefix is added.
        highlight = lmap(is_mask, tokens)
        scores = entry.get_vector("overlap_score")
        pos_list = entry.get_vector("masked_lm_positions")
        log_probs = np.reshape(entry.get_vector("masked_lm_log_probs"), [20, -1])

        table = []
        for score, position, log_prob in zip(scores, pos_list, log_probs):
            tokens[position] = "{}-".format(position) + tokens[position]

            line = [Cell(position), Cell(score)]
            top5 = np.argsort(log_prob)[::-1][:5]
            for vocab_idx in top5:
                term = tokenizer.inv_vocab[vocab_idx]
                line.append(Cell(term))
                line.append(Cell(math.exp(log_prob[vocab_idx])))
            table.append(line)

        cells = data.cells_from_tokens(tokens, highlight)
        for score, position in zip(scores, pos_list):
            cells[position].highlight_score = score / 10000 * 255

        html_writer.multirow_print(cells, 20)
        html_writer.write_table(table)
Пример #21
0
def run():
    """Render premise/hypothesis tokens highlighted by explanation scores.

    Streams items from the "contradiction_prediction" pickle stream. Each
    item is ((input_ids, _, _), (logit, explain)). Tokens and explanation
    scores are split into the two segments with `split_p_h_with_input_ids`,
    the scores are normalized per segment, and the logit followed by the
    "P:" and "H:" rows is written to "contradiction_prediction.html". The
    loop stops after the item at which the counter exceeds 100.
    """
    tokenizer = get_tokenizer()
    reader = StreamPickleReader("contradiction_prediction")

    html = HtmlVisualizer("contradiction_prediction.html")
    n_done = 0
    while reader.has_next():
        example, prediction = reader.get_item()
        input_ids, _, _ = example
        logit, explain = prediction

        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        prem_tokens, hypo_tokens = split_p_h_with_input_ids(tokens, input_ids)
        prem_scores, hypo_scores = split_p_h_with_input_ids(explain, input_ids)

        prem_scores = normalize(prem_scores)
        hypo_scores = normalize(hypo_scores)
        prem_cells = [Cell("P:")] + cells_from_tokens(prem_tokens, prem_scores)
        hypo_cells = [Cell("H:")] + cells_from_tokens(hypo_tokens, hypo_scores)

        html.write_paragraph(str(logit))
        html.multirow_print(prem_cells)
        html.multirow_print(hypo_cells)

        if n_done > 100:
            break
        n_done += 1
Пример #22
0
def per_doc_score():
    """Render documents whose average priority score is outside (-0.30, -0.11).

    Reads the "tlm_view.pickle" prediction set and considers the first 1001
    instances. Instances with fewer than 40 token cells, or with an average
    priority score strictly between -0.30 and -0.11, are skipped. For the
    rest, the average is printed and written as a headline to
    "per_doc_score.html", followed by the tokens in rows of 20 cells
    (including the final, possibly shorter row).
    """
    filename = "tlm_view.pickle"
    html_writer = HtmlVisualizer("per_doc_score.html", dark_mode=False)

    data = EstimatorPredictionViewerGosford(filename)
    small_threshold = 40
    row_width = 20
    for inst_i, entry in enumerate(data):
        if inst_i > 1000:
            break
        scores = entry.get_vector("priority_score")

        tokens = entry.get_mask_resolved_input_mask_with_input()
        cells = data.cells_from_tokens(tokens)
        if len(cells) < small_threshold:
            continue
        avg_score = average(scores)
        if -0.11 > avg_score > -0.30:
            continue
        print(avg_score)
        html_writer.write_headline(avg_score)
        # Slicing also emits the last partial row, which the previous
        # "flush when full" loop silently dropped.
        for start in range(0, len(cells), row_width):
            html_writer.write_table([cells[start:start + row_width]])
Пример #23
0
def show(out_file_name, summarized_table: List[Entry]):
    """Render tokens coloured by their contribution for entries that changed enough.

    Each entry is (input_ids, prob, contributions). The score line is written
    for every entry. Tokens up to the first "[PAD]" are coloured "R" for a
    positive contribution and "B" otherwise, with highlight |value| * 100;
    positions absent from `contributions` are shown in "Gray" with highlight
    150. The token rows (30 wide) are written unless the largest absolute
    contribution is below 0.05. A summary count is printed at the end.
    """
    html = HtmlVisualizer(out_file_name)
    tokenizer = get_tokenizer()
    printed = 0
    for input_ids, prob, contributions in summarized_table:
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        html.write_paragraph("Score : {}".format(prob))
        cells = []
        largest = 0
        for idx, _ in enumerate(input_ids):
            token = tokens[idx]
            if token == "[PAD]":
                break
            if idx not in contributions:
                cells.append(Cell(token, highlight_score=150, target_color="Gray"))
                continue

            raw = contributions[idx]
            largest = max(abs(raw), largest)
            cells.append(Cell(token,
                              highlight_score=abs(raw) * 100,
                              target_color="R" if raw > 0 else "B"))

        # Written as a negated "<" so the outcome matches the if/else form
        # for every value of `largest`.
        if not largest < 0.05:
            html.multirow_print(cells, 30)
            printed += 1

    print("printed {} of {}".format(printed, len(summarized_table)))
Пример #24
0
def show_tfrecord(file_path):
    """Render each record's two segments with alt-embedding positions highlighted.

    For every record in `file_path`, the input ids are converted to tokens
    and, together with "alt_emb_mask", split into two segments by
    `split_p_h_with_input_ids`. The label and one single-row table per
    segment are written to "<basename of file_path>.html"; tokens whose mask
    value is truthy get highlight 100.
    """

    def highlighted_cells(seg_tokens, seg_mask):
        return [
            Cell(tok, 100 if seg_mask[pos] else 0)
            for pos, tok in enumerate(seg_tokens)
        ]

    records = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    html = HtmlVisualizer(os.path.basename(file_path) + ".html")
    for features in records:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        p_cells = highlighted_cells(p_tokens, p_mask)
        h_cells = highlighted_cells(h_tokens, h_mask)

        label = take(features["label_ids"])[0]

        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])
Пример #25
0
def main():
    """Write every record found under the fixed tf_rel_filter dev directory to HTML.

    All files in the directory are read with `load_record` and each record is
    passed to `write_feature_to_html` together with the "tf_rel_filter.html"
    visualizer and the tokenizer.
    """
    html = HtmlVisualizer("tf_rel_filter.html")
    tokenizer = get_tokenizer()

    path = "/mnt/nfs/work3/youngwookim/data/bert_tf/tf_rel_filter_B_dev/"

    for record_file in get_dir_files(path):
        for feature in load_record(record_file):
            write_feature_to_html(feature, html, tokenizer)
Пример #26
0
def main():
    """Render explanation scores and per-layer probabilities for up to 100 examples.

    Loads the pickled "alamri_mismatch_all" output and, for each of the first
    100 data points, writes the prediction summary and gold label to
    "alamri_mismatch.html", followed by a table with: the tokens highlighted
    by explanation score, the explanation scores themselves (highlighted when
    above 0.5), and one row per layer (12) holding column 1 of the sigmoid of
    that layer's logits.
    """
    save_name = "alamri_mismatch_all"
    output_d = load_from_pickle(save_name)
    html = HtmlVisualizer("alamri_mismatch.html")
    tokenizer = get_tokenizer()
    logits_grouped_by_layer = output_d["per_layer_logits"]
    num_layers = 12

    def labeled_row(head, float_arr):
        return [Cell(head)] + lmap(Cell, map(two_digit_float, float_arr))

    num_data = len(output_d['input_ids'])
    for data_idx in range(min(num_data, 100)):
        tokens = tokenizer.convert_ids_to_tokens(output_d["input_ids"][data_idx])
        ex_scores = output_d['ex_scores'][data_idx]
        probs = scipy.special.softmax(output_d['logits'][data_idx])

        pred_str = make_prediction_summary_str(probs)

        html.write_paragraph("Prediction: {}".format(pred_str))
        html.write_paragraph("gold label={}".format(output_d["label"][data_idx]))

        token_row = [Cell("")]
        token_row += [Cell(t, int(s * 100)) for t, s in zip(tokens, ex_scores)]
        score_row = labeled_row("ex_prob", ex_scores)
        # Offset by one because index 0 of the row is the head cell.
        for i, s in enumerate(ex_scores):
            if s > 0.5:
                score_row[i + 1].highlight_score = 100

        table = [token_row, score_row]
        for layer_no in range(num_layers):
            layer_probs = sigmoid(logits_grouped_by_layer[layer_no][data_idx])
            table.append(
                labeled_row("layer_{}".format(layer_no), layer_probs[:, 1]))

        html.write_table(table)
Пример #27
0
def per_doc_score():
    """Render tokens highlighted by their "layer_count" value.

    Reads the "fetch_hidden_dim.pickle" prediction set and writes the first
    101 instances to "preserved.html". Each token cell and a parallel cell
    holding the raw count are highlighted by count / 728 * 100 and written in
    rows of 20 (including the final, possibly shorter row), followed by the
    instance's average count.
    """
    filename = "fetch_hidden_dim.pickle"
    html_writer = HtmlVisualizer("preserved.html", dark_mode=False)

    n_skip = 0
    row_width = 20
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break
        count_preserved = entry.get_vector("layer_count")
        tokens = entry.get_tokens("input_ids")
        cells = data.cells_from_tokens(tokens)
        avg = np.average(count_preserved)
        # Debug toggle: use `avg > 20` to print only instances with a high
        # average and report how many were skipped in between.
        f_print = True
        print(avg)
        if f_print:
            html_writer.write_paragraph("Skipped {} articles".format(n_skip))
            n_skip = 0
            count_cells = []
            for idx, cell in enumerate(cells):
                # NOTE(review): 728 may be a typo for a hidden size of 768 --
                # confirm before changing.
                score = count_preserved[idx] / 728 * 100
                cell.highlight_score = score
                count_cells.append(Cell(count_preserved[idx], score))

            # Slicing also emits the last partial row, which the previous
            # "flush when full" loop silently dropped.
            for start in range(0, len(cells), row_width):
                end = start + row_width
                html_writer.write_table([cells[start:end], count_cells[start:end]])

            html_writer.write_paragraph(str(avg))
        else:
            n_skip += 1
Пример #28
0
def print_as_html(fn):
    """Write the tokens of every record in `fn` to "out_name.html".

    The "input_ids" of each record are converted to tokens in chunks of 512
    ids with the BERT vocabulary at <data_path>/bert_voca.txt; each chunk is
    written as a multirow, and a "----------" paragraph separates records.
    """
    examples = load_record(fn)
    vocab_path = os.path.join(data_path, "bert_voca.txt")
    tokenizer = tokenizer_wo_tf.FullTokenizer(vocab_path)

    html_output = HtmlVisualizer("out_name.html")

    chunk_size = 512
    for feature in examples:
        all_ids = feature["input_ids"].int64_list.value
        for start in range(0, len(all_ids), chunk_size):
            chunk = all_ids[start:start + chunk_size]
            chunk_tokens = tokenizer.convert_ids_to_tokens(chunk)
            html_output.multirow_print(cells_from_tokens(chunk_tokens))
        html_output.write_paragraph("----------")
Пример #29
0
def show_prediction(filename, file_path, correctness_1, correctness_2):
    """Render the examples that at least one of two runs got wrong.

    Iterates the prediction set `filename` in lockstep with the records in
    `file_path`, asserting that both carry the same input ids. For every
    index where `correctness_1` or `correctness_2` is falsy, the label, both
    correctness flags and the two segments (tokens with a truthy
    "alt_emb_mask" highlighted) are written to
    "<basename of filename>.html".

    Args:
        filename: prediction file read by `EstimatorPredictionViewerGosford`.
        file_path: record file read by `load_record_v2`.
        correctness_1: per-example correctness flags of the first run.
        correctness_2: per-example correctness flags of the second run.
    """

    def highlighted_cells(seg_tokens, seg_mask):
        return [
            Cell(seg_tokens[i], 100 if seg_mask[i] else 0)
            for i in range(len(seg_tokens))
        ]

    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    for idx, entry in enumerate(data):
        features = next(itr)

        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        # Sanity check that predictions and records are aligned.
        assert np.all(input_ids == input_ids2)

        # Nothing is written when both runs were correct, so skip the
        # tokenization and cell construction for those examples.
        if correctness_1[idx] and correctness_2[idx]:
            continue

        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        p_cells = highlighted_cells(p_tokens, p_mask)
        h_cells = highlighted_cells(h_tokens, h_mask)

        label = take(features["label_ids"])[0]

        html.write_paragraph("Label : {} Correct: {}/{}".format(
            label, correctness_1[idx], correctness_2[idx]))
        html.write_table([p_cells])
        html.write_table([h_cells])
Пример #30
0
def loss_view():
    """Render masked tokens and per-example losses of "sero_pred.pickle".

    Prints the shapes of "masked_lm_example_loss" and "masked_input_ids" of
    the first pickled batch, then for every entry of the prediction set
    prints the losses and writes to "sero_pred.html" the masked-input tokens
    in rows of 20 cells (including the final, possibly shorter row) followed
    by the losses as a 20-wide multirow.
    """
    filename = "sero_pred.pickle"
    p = os.path.join(output_path, filename)
    # Context manager so the pickle file handle is closed deterministically.
    with open(p, "rb") as f:
        raw = pickle.load(f)
    print(raw[0]["masked_lm_example_loss"].shape)
    print(raw[0]["masked_input_ids"].shape)

    html_writer = HtmlVisualizer("sero_pred.html", dark_mode=False)

    row_width = 20
    data = EstimatorPredictionViewerGosford(filename)
    for entry in data:
        losses = entry.get_vector("masked_lm_example_loss")
        print(losses)
        tokens = entry.get_tokens("masked_input_ids")
        cells = data.cells_from_tokens(tokens)

        # Slicing also emits the last partial row, which the previous
        # "flush when full" loop silently dropped.
        for start in range(0, len(cells), row_width):
            html_writer.write_table([cells[start:start + row_width]])

        html_writer.multirow_print(data.cells_from_anything(losses), row_width)