Example #1
def load_bert_like():
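    # Restore BERT weights into a BertLike graph under a TF1-compat session, fetch
    # the per-layer attention probabilities, and write per-head tables of attention
    # grouped by relative offset (target position - source position) to position.html.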
    disable_eager_execution()
    model = BertLike()
    sess = init_session()
    #sess.run(tf.compat.v1.global_variables_initializer())
    load_v2_to_v2(sess, get_bert_full_path(), False)

    attention_prob_list, = sess.run([model.attention_probs_list])
    html = HtmlVisualizer("position.html")

    for layer_no, attention_prob in enumerate(attention_prob_list):
        html.write_headline("Layer {}".format(layer_no))
        acc_dict = {}
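        # acc_dict maps (relative offset, head index) -> attention probabilities;
        # attention to the first token (target position 0) is collected separately in zero_scores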

        num_head = 12  # heads per layer, matching the hard-coded 12 used below
        zero_scores = [list() for _ in range(num_head)]

        for loc in range(2, 40, 2):
            print("Source : ", loc)
            for target_loc in range(20):
                offset = target_loc - loc

                print(offset, end=" ")
                for head_idx in range(num_head):
                    key = offset, head_idx
                    if key not in acc_dict:
                        acc_dict[key] = []
                    e = attention_prob[0, head_idx, loc, target_loc]
                    if target_loc != 0:
                        acc_dict[key].append(e)
                    else:
                        zero_scores[head_idx].append(e)
                    print("{0:.2f}".format(e * 100), end=" ")
                print()

        rows = [[Cell("Loc")] + [Cell("Head{}".format(i)) for i in range(12)]]
        for offset in range(-7, +7):
            print(offset, end=" ")
            scores = []
            for head_idx in range(12):
                key = offset, head_idx

                try:
                    elems = acc_dict[key]
                    if len(elems) < 3:
                        raise KeyError

                    avg = average(elems)
                    scores.append(avg)
                    print("{0:.2f}".format(avg * 100), end=" ")
                except KeyError:
                    print("SKIP")
            print()
            rows.append([Cell(offset)] +
                        [Cell(float(v * 100), v * 1000) for v in scores])
        html.write_table(rows)

        html.write_paragraph("Attention to first token")
        zero_scores = [average(l) for l in zero_scores]
        rows = [[Cell("   ")] + [Cell("Head{}".format(i)) for i in range(12)],
                [Cell("   ")] +
                [Cell(float(v * 100), v * 1000) for v in zero_scores]]
        html.write_table(rows)
Example #2
def load_and_visualize():
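    # For each group of lookup examples, decode the selected word, pair it with its
    # masked-LM example loss, and write word / loss / decoded d_input_ids rows to HTML.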
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"

    n_list = open(os.path.join(output_path, "lookup_n", data_id),
                  "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = next(feature_itr)

            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)

            emph_word = "<b>" + word + "</b>"

            if j == 0:
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, id in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = id

                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            # use a separate index so the outer loop variable i is not shadowed
            for t_idx in range(len(tokens)):
                if tokens[t_idx] == "[MASK]":
                    tokens[t_idx] = "[MASK_{}: {}]".format(t_idx, mask_ans[t_idx])
                if t_idx in d_location_ids and t_idx != 0:
                    if tokens[t_idx - 1] != emph_word:
                        tokens[t_idx] = emph_word
                    else:
                        tokens[t_idx] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = list()
            row.append(Cell(word))
            row.append(Cell(data[feature_idx]))
            row.append(Cell(def_str))
            rows.append(row)

            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)

        html_writer.write_table(rows)

    html_writer.close()
Example #3
def diff_view():
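    # Compare per-token masked-LM losses from two prediction pickles: each token is
    # shaded by the magnitude of the difference in exp(-loss) between the two runs,
    # with the color indicating which run assigned the higher probability.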
    tokenizer = get_tokenizer()
    filename = "bert_815.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))
    filename = "bfn_3_200_815.pickle"
    p = os.path.join(output_path, filename)
    data2 = pickle.load(open(p, "rb"))

    run_name = "diff"

    batch_size, seq_length = data[0]['masked_input_ids'].shape
    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []

    masked_lm_positions = []
    masked_lm_ids = []
    for e in data[:-1]:
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])
        masked_lm_ids.append(e["masked_lm_ids"])

    masked_lm_example_loss2 = []
    for e in data2[:-1]:
        masked_lm_example_loss2.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))

    masked_lm_example_loss2 = np.concatenate(masked_lm_example_loss2)

    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)
    masked_lm_ids = np.concatenate(masked_lm_ids)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)
    n_instance = len(input_ids)
    for inst_idx in range(n_instance):

        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])

        ans_keys = dict(
            zip(masked_lm_positions[inst_idx],
                tokenizer.convert_ids_to_tokens(masked_lm_ids[inst_idx])))

        loss_at_loc = {
            p: l
            for l, p in zip(masked_lm_example_loss[inst_idx],
                            masked_lm_positions[inst_idx])
        }
        loss_at_loc2 = {
            p: l
            for l, p in zip(masked_lm_example_loss2[inst_idx],
                            masked_lm_positions[inst_idx])
        }

        score_at_loc = {k: math.exp(-v) for k, v in loss_at_loc.items()}
        score_at_loc2 = {k: math.exp(-v) for k, v in loss_at_loc2.items()}
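        # exp(-loss) above converts the masked-LM cross-entropy into the probability
        # the model assigned to the correct token at each masked position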

        def is_dependent(token):
            return len(token) == 1 and not token[0].isalnum()

        cells = []
        for i in range(len(tokens)):
            f_inverse = False
            score = 0
            if tokens[i] == "[MASK]" or i in loss_at_loc:
                tokens[i] = "[{}-{}]".format(i, ans_keys[i])
                score = (score_at_loc2[i] - score_at_loc[i]) * 180
                score = -score
                if score < 0:
                    f_inverse = True
                    score = abs(score)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"

            if tokens[i] != "[PAD]":
                term = tokens[i]
                cont_left = term[:2] == "##"
                cont_right = i + 1 < len(tokens) and tokens[i + 1][:2] == "##"
                if i + 1 < len(tokens):
                    dependent_right = is_dependent(tokens[i + 1])
                else:
                    dependent_right = False

                dependent_left = is_dependent(tokens[i])

                if cont_left:
                    term = term[2:]

                space_left = "&nbsp;" if not (cont_left
                                              or dependent_left) else ""
                space_right = "&nbsp;" if not (cont_right
                                               or dependent_right) else ""

                if not f_inverse:
                    cells.append(Cell(term, score, space_left, space_right))
                else:
                    cells.append(
                        Cell(term,
                             score,
                             space_left,
                             space_right,
                             target_color="R"))
        #s = tokenization.pretty_tokens(tokens)

        rows = []
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))

        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            loss1 = score_at_loc[pos]
            loss2 = score_at_loc2[pos]
            loss_diff = loss1 - loss2
            rows.append((Cell(pos), Cell(loss1), Cell(loss2), Cell(loss_diff)))

        html_writer.write_table(rows)

    html_writer.close()
Example #4
def pred_loss_view():
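    # Visualize predicted vs. observed ("gold") loss differences per masked position:
    # tokens are highlighted when either the predicted or the gold drop exceeds 0.3,
    # colored by whether the two agree, followed by per-position probability and diff tables.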
    tokenizer = get_tokenizer()
    filename = "tlm_loss_pred.pickle"
    filename = "tlm_loss_pred_on_dev.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    batch_size, seq_length = data[0]['input_ids'].shape

    keys = list(data[0].keys())
    vectors = {}
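    # gather every batch for each feature key, then concatenate into one array per key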

    for e in data:
        for key in keys:
            if key not in vectors:
                vectors[key] = []
            vectors[key].append(e[key])

    for key in keys:
        vectors[key] = np.concatenate(vectors[key], axis=0)

    html_writer = HtmlVisualizer("pred_make_sense_dev.html", dark_mode=False)

    n_instance = len(vectors['input_ids'])
    n_instance = min(n_instance, 100)
    for inst_idx in range(n_instance):
        tokens = tokenizer.convert_ids_to_tokens(
            vectors['input_ids'][inst_idx])
        locations = list(vectors['masked_lm_positions'][inst_idx])

        def is_dependent(token):
            return len(token) == 1 and not token[0].isalnum()

        cells = []
        for i in range(len(tokens)):
            f_same_pred = False
            score = 0
            if i in locations and i != 0:
                i_idx = locations.index(i)
                tokens[i] = "[{}:{}]".format(i_idx, tokens[i])
                pred_diff = vectors['pred_diff'][inst_idx][i_idx]
                gold_diff = vectors['gold_diff'][inst_idx][i_idx]
                pred_label = pred_diff > 0.3
                gold_label = gold_diff > 0.3
                if pred_label:
                    score = 100
                    if gold_label:
                        f_same_pred = True
                else:
                    if gold_label:
                        score = 30
                        f_same_pred = False

            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"

            if tokens[i] != "[PAD]":
                term = tokens[i]
                cont_left = term[:2] == "##"
                cont_right = i + 1 < len(tokens) and tokens[i + 1][:2] == "##"
                if i + 1 < len(tokens):
                    dependent_right = is_dependent(tokens[i + 1])
                else:
                    dependent_right = False

                dependent_left = is_dependent(tokens[i])

                if cont_left:
                    term = term[2:]

                space_left = "&nbsp;" if not (cont_left
                                              or dependent_left) else ""
                space_right = "&nbsp;" if not (cont_right
                                               or dependent_right) else ""

                if f_same_pred:
                    cells.append(Cell(term, score, space_left, space_right))
                else:
                    cells.append(
                        Cell(term,
                             score,
                             space_left,
                             space_right,
                             target_color="R"))

        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []

        row_head = [
            Cell("Index"),
            Cell("P]Prob1"),
            Cell("P]Prob2"),
            Cell("G]Prob1"),
            Cell("G]Prob2"),
            Cell("P]Diff"),
            Cell("G]Diff"),
        ]

        def f_cell(obj):
            return Cell("{:04.2f}".format(obj))

        rows = [row_head]
        pred_diff_list = []
        gold_diff_list = []
        for idx, pos in enumerate(locations):
            if pos == 0:
                break
            pred_diff = vectors['pred_diff'][inst_idx][idx]
            gold_diff = vectors['gold_diff'][inst_idx][idx]
            pred_diff_list.append(pred_diff)
            gold_diff_list.append(gold_diff)

            row = [
                Cell(idx),
                f_cell(vectors['prob1'][inst_idx][idx]),
                f_cell(vectors['prob2'][inst_idx][idx]),
                f_cell(math.exp(-vectors['loss_base'][inst_idx][idx])),
                f_cell(math.exp(-vectors['loss_target'][inst_idx][idx])),
                f_cell(pred_diff),
                f_cell(gold_diff),
            ]
            rows.append(row)

        html_writer.write_table(rows)

        pred_diff = np.average(pred_diff_list)
        gold_diff = np.average(gold_diff_list)
        html_writer.write_paragraph(
            "Average Pred diff ={:04.2f} Observed diff={:04.2f} ".format(
                pred_diff, gold_diff))

        if pred_diff > 0.3:
            html_writer.write_headline("High Drop")
        elif pred_diff < 0.1:
            html_writer.write_headline("Low Drop")
Example #5
def work():
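    # Render each instance's masked tokens together with their answer tokens, shaded by
    # masked-LM loss, followed by a (position, loss) table sorted by position.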
    tokenizer = get_tokenizer()
    filename = "bert_815.pickle"
    filename = "bfn_3_200_815.pickle"
    run_name = filename[:-(len(".pickle"))]
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    batch_size, seq_length = data[0]['masked_input_ids'].shape
    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []
    masked_lm_positions = []
    for e in data[:-1]:
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])

    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)
    n_instance = len(input_ids)
    for inst_idx in range(min(n_instance, 200)):  # cap output at 200 instances

        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])

        loss_at_loc = {
            p: l
            for l, p in zip(masked_lm_example_loss[inst_idx],
                            masked_lm_positions[inst_idx])
        }

        cells = []
        for i in range(len(tokens)):
            score = 0
            if tokens[i] == "[MASK]":
                tokens[i] = "[{}]".format(ans_tokens[i])
                score = loss_at_loc[i] * 255 / 25
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"

            if tokens[i] != "[PAD]":
                cells.append(Cell(tokens[i], score))
        #s = tokenization.pretty_tokens(tokens)

        rows = []
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))

        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            rows.append((Cell(pos), Cell(loss)))

        html_writer.write_table(rows)

    html_writer.close()
Example #6
def do():
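    # For each (prediction pickle, tfrecord, output html) entry, write token-level
    # prob1 / prob2 / useful(p1, p2) tables, then fit and plot a degree-4 polynomial
    # regression of p2 on p1 over a random sample of the collected pairs.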
    pred_file_name = "RLPP_0.pickle"
    pred_file_name = "ukp_rel.pickle"
    record_file_name = "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0"
    record_file_name = "C:\\work\\Code\\Chair\\output\\tf_enc"
    todo = [
        ("RLPP_0.pickle", "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0",
         "RLPP_wiki.html"),
        ("ukp_rel.pickle", "C:\\work\\Code\\Chair\\output\\tf_enc",
         "RLPP_ukp.html")
    ]
    for pred_file_name, record_file_name, out_name in todo:
        # x and y are re-bound to tuples by zip(*l) below, so initialize them
        # per file rather than accumulating across the todo entries
        x = []
        y = []
        viewer = EstimatorPredictionViewerGosford(pred_file_name)
        html = HtmlVisualizer(out_name)
        itr1 = load_record_v2(record_file_name)
        itr2 = iter(viewer)
        cnt = 0
        for features, entry in zip(itr1, itr2):
            cnt += 1
            if cnt > 200:
                break
            input_ids1 = entry.get_tokens("input_ids")
            prob1 = entry.get_vector("prob1")
            prob2 = entry.get_vector("prob2")

            cells = viewer.cells_from_tokens(input_ids1)
            p1_l = []
            p2_l = []
            useful_l = []

            row1 = []
            row2 = []
            row3 = []
            row4 = []
            for j, cell in enumerate(cells):
                p1 = float(prob1[j])
                p2 = float(prob2[j])
                x.append([p1])
                y.append(p2)
                u = useful(p1, p2)
                score = (1 - u) * 100
                cell.highlight_score = score
                row1.append(cell)
                row2.append(Cell(p1, score))
                row3.append(Cell(p2, score))
                row4.append(Cell(u, score))

                p1_l.append(p1)
                p2_l.append(p2)
                useful_l.append(u)
                if len(row1) > 20:
                    rows = [row1, row2, row3, row4]
                    row1 = []
                    row2 = []
                    row3 = []
                    row4 = []
                    html.write_table(rows)

            html.write_paragraph("p1: {}".format(average(p1_l)))
            html.write_paragraph("p2: {}".format(average(p2_l)))
            html.write_paragraph("useful: {}".format(average(useful_l)))

            if average(useful_l) < 0.4:
                html.write_headline("Low Score")

        l = list(zip(x, y))
        random.shuffle(l)
        l = l[:1000]
        x, y = zip(*l)
        lin = LinearRegression()
        lin.fit(x, y)

        poly = PolynomialFeatures(degree=4)
        X_poly = poly.fit_transform(x)
        poly.fit(X_poly, y)
        lin2 = LinearRegression()
        lin2.fit(X_poly, y)
        plt.scatter(x, y, color='blue')

        plt.plot(x, lin2.predict(poly.fit_transform(x)), color='red')
        plt.title('Polynomial Regression')

        plt.show()
Example #7
def write_deletion_score_to_html(out_file_name, summarized_table: List[Entry],
                                 info: Dict[int, Dict]):
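    # Shade each token by its deletion (contribution) score, then report the tokens
    # with the largest drops and the prediction obtained after the max-drop deletion.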
    text_to_info = claim_text_to_info()
    html = HtmlVisualizer(out_file_name)
    tokenizer = get_biobert_tokenizer()
    num_print = 0
    for entry in summarized_table:
        tokens = tokenizer.convert_ids_to_tokens(entry.input_ids)
        idx_sep1, idx_sep2 = get_sep_loc(entry.input_ids)
        max_change = 0
        max_drop = 0
        cells = cells_from_tokens(tokens)

        drops = []
        for idx in range(len(tokens)):
            if tokens[idx] == "[PAD]":
                break
            if tokens[idx] == '[SEP]':
                continue

            if idx in entry.contribution:
                raw_score = entry.contribution[idx]
                e = idx, raw_score
                drops.append(e)

        drops.sort(key=get_second)
        _, largest_drop = drops[0]  # most negative contribution, i.e. the largest drop

        max_drop_idx = -1
        max_drop_case_logit = None
        for idx in range(len(tokens)):
            if tokens[idx] == "[PAD]":
                break
            if tokens[idx] == '[SEP]':
                continue
            if idx in entry.contribution:
                raw_score = entry.contribution[idx]

                max_change = max(abs(raw_score), max_change)
                if max_drop > raw_score:
                    max_drop = raw_score
                    max_drop_idx = idx
                    max_drop_case_logit = entry.case_logits_d[idx]

                if raw_score < 0:
                    score = abs(raw_score / largest_drop) * 200
                    color = "B"
                else:
                    score = 0
                    color = "B"
            else:
                score = 150
                color = "Gray"
            cells[idx].highlight_score = score
            cells[idx].target_color = color

        if max_change < 0.05 and False:  # "and False" disables this filter, so every entry is rendered
            pass
        else:
            # if random.random() < 0.90:
            #     continue
            base_probs = scipy.special.softmax(entry.base_logits)
            info_entry = info[str(entry.data_id[0])]
            claim1_info: Dict = text_to_info[info_entry['text1']]
            claim2_info: Dict = text_to_info[info_entry['text2']]
            question = claim1_info['question']
            assertion1 = claim1_info['assertion']
            assertion2 = claim2_info['assertion']
            original_prediction_summary = make_prediction_summary_str(
                base_probs)
            html.write_bar()
            html.write_paragraph("Question: {}".format(question))
            html.write_paragraph("Original prediction: " +
                                 original_prediction_summary)
            html.write_paragraph("Max drop")

            rows = []
            for idx, score in drops[:5]:
                row = [Cell(str(idx)), Cell(tokens[idx]), Cell(score)]
                rows.append(row)
            html.write_table(rows)

            min_token = tokens[max_drop_idx]
            html.write_paragraph("> \"{}\": {} ".format(min_token, max_drop))
            max_drop_case_prob = scipy.special.softmax(max_drop_case_logit)
            max_drop_prediction_summary = make_prediction_summary_str(
                max_drop_case_prob)
            html.write_paragraph("> " + max_drop_prediction_summary)
            p = [Cell("Claim1 ({}):".format(assertion1))] + cells[1:idx_sep1]
            h = [Cell("Claim2 ({}):".format(assertion2))
                 ] + cells[idx_sep1 + 1:idx_sep2]
            html.write_table([p])
            html.write_table([h])
            num_print += 1

    print("printed {} of {}".format(num_print, len(summarized_table)))