Example #1
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.ndarray]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()

    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(
                word, diff, pos, neg))
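Every example in this list ultimately calls pretty_tokens to turn BERT WordPiece tokens back into readable text. The repository's implementation is not shown here; the sketch below is only an assumed stand-in, inferred from the drop_sharp keyword used in Example #8 and the "##" continuation handling in Example #4.

from typing import List


def pretty_tokens_sketch(tokens: List[str], drop_sharp: bool = False) -> str:
    # Assumed behavior: join WordPiece tokens with spaces; when drop_sharp is
    # True, glue "##" continuation pieces onto the previous token.
    out = ""
    for t in tokens:
        if drop_sharp and t.startswith("##"):
            out += t[2:]
        elif out:
            out += " " + t
        else:
            out = t
    return out


# e.g. pretty_tokens_sketch(["un", "##real", "##istic"], True) -> "unrealistic"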
Example #2
    def get_input_as_text(self,
                          resolve_mask=False,
                          highlight_lookup_word=False):
        if resolve_mask:
            mask_ans = self.get_mask_answer_dict()

        if highlight_lookup_word:
            d_location_ids = self.get_d_location_ids()
            word = self.get_selected_word_text()
            emph_word = "<b>" + word + "</b>"

        tokens = self.tokenizer.convert_ids_to_tokens(self.get_input_ids())
        for i in range(len(tokens)):
            if resolve_mask and tokens[i] == "[MASK]":
                tokens[i] = "[MASK_{}: {}]".format(i, mask_ans[i])
            if highlight_lookup_word and i in d_location_ids and i != 0:
                print(i, emph_word)
                if tokens[i - 1] != emph_word:
                    tokens[i] = emph_word
                else:
                    tokens[i] = "-"

        return tokenization.pretty_tokens(tokens, True)
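get_mask_answer_dict is not part of this excerpt, but Example #7 builds the same mapping inline from masked_lm_positions and masked_lm_ids. A sketch consistent with that usage is shown below; the getter names are made up for illustration, not taken from the actual class.

    def get_mask_answer_dict(self):
        # Assumed behavior: map each masked position to the token that was
        # masked out there, mirroring the inline construction in Example #7.
        positions = self.get_masked_lm_positions()    # hypothetical getter
        term_ids = self.get_masked_lm_ids()           # hypothetical getter
        terms = self.tokenizer.convert_ids_to_tokens(term_ids)
        return {pos: term for pos, term in zip(positions, terms)}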
Example #3
    def __init__(self, d: Dict[WordAsID, np.ndarray], skip_stopwords=True, stem=True):
        self.tokenizer = get_tokenizer()

        self.stopwords_as_ids: Set[WordAsID] = set()
        new_d = {}
        if skip_stopwords:
            stopwords = load_stopwords_for_query()
            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                if len(tokens) == 1 and tokens[0] in stopwords:
                    self.stopwords_as_ids.add(key)
                else:
                    new_d[key] = d[key]
            d = new_d

        if stem:
            d_raw = defaultdict(list)
            stemmer = Stemmer()

            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                plain_word = pretty_tokens(tokens, True)
                stemmed = stemmer.stem(plain_word)
                d_raw[stemmed].append(d[key])

            new_d: Dict[str, TokenScore] = {}
            for key, items in d_raw.items():
                score: TokenScore = [average([t[0] for t in items]), average([t[1] for t in items])]
                new_d[key] = score
            d = new_d
            self.stem = True
            self.stemmer = stemmer
            self.log_odd = self.log_odd_w_stem

        self.d = d
        self.smoothing = 0.1
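The constructor points self.log_odd at self.log_odd_w_stem and stores a smoothing constant, but the scoring method itself is outside this excerpt. Assuming it mirrors the pos/neg log difference computed in Example #1 (and that math is imported in the module), a smoothed version might look like this hypothetical method:

    def log_odd_sketch(self, key) -> float:
        # Hypothetical scoring method: smoothed log-odds between the positive
        # and negative scores stored in self.d for this (possibly stemmed) key.
        pos, neg = self.d[key]
        return math.log(pos + self.smoothing) - math.log(neg + self.smoothing)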
Example #4
def analyze_gradient(data, tokenizer):
    gradients = data['gradients']
    d_input_ids = data['d_input_ids']
    mask_input_ids = data['masked_input_ids']
    masked_lm_positions = data["masked_lm_positions"]

    n_inst, seq_len = mask_input_ids.shape
    n_inst2, def_len = d_input_ids.shape

    assert n_inst == n_inst2

    def_len = 256
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(gradients, n_inst, def_len, hidden_dim)
    print(reshaped_grad.shape)

    n_pred = reshaped_grad.shape[1]

    grad_per_token = np.sum(np.abs(reshaped_grad), axis=3)

    html_writer = HtmlVisualizer("dict_grad.html", dark_mode=False)

    for inst_idx in range(n_inst):
        tokens = tokenizer.convert_ids_to_tokens(mask_input_ids[inst_idx])
        #ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])
        for i in range(len(tokens)):
            if tokens[i] == "[MASK]":
                tokens[i] = "[MASK_{}]".format(i)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
        def_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[inst_idx])
        s = tokenizer_wo_tf.pretty_tokens(tokens)

        lines = []

        grad_total_max = 0
        for pred_idx in range(n_pred):
            row = []
            max_val = max(grad_per_token[inst_idx, pred_idx])
            total = sum(grad_per_token[inst_idx, pred_idx])
            mask_pos = masked_lm_positions[inst_idx, pred_idx]

            if total > grad_total_max:
                grad_total_max = total

            row.append(Cell(mask_pos))
            row.append(Cell(int(total)))

            for def_idx in range(def_len):
                term = def_tokens[def_idx]
                cont_right = def_idx + 1 < def_len and def_tokens[
                    def_idx + 1][:2] == "##"
                cont_left = term[:2] == "##"

                if term == "[PAD]":
                    break
                if term == "[unused5]":
                    term = "[\\n]"

                score = grad_per_token[inst_idx, pred_idx,
                                       def_idx] / (hidden_dim * 2)
                bg_color = get_color(score)

                row.append(Cell(term, score, not cont_left, not cont_right))
                print("{}({})".format(
                    term, grad_per_token[inst_idx, pred_idx, def_idx]),
                      end=" ")

            lines.append((mask_pos, row))
            print("")
        lines.sort(key=lambda x: x[0])

        s = s.replace("[unused4]", "<b>DictTerm</b>")
        html_writer.write_paragraph(s)

        if grad_total_max > 5000000:
            html_writer.write_headline("HIGH Gradient")

        rows = right(lines)
        html_writer.write_table(rows)

        print("----------")
    html_writer.close()
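reshape_gradienet is a repository helper that is not shown here. Judging only from how its output is indexed as [inst_idx, pred_idx, def_idx, hidden_dim] and summed over the last axis, a minimal stand-in could be the following (an assumption, not the actual implementation):

import numpy as np


def reshape_gradient_sketch(gradients, n_inst, def_len, hidden_dim):
    # Assumed behavior: recover a [n_inst, n_pred, def_len, hidden_dim] tensor,
    # letting numpy infer n_pred from the total number of elements.
    return np.asarray(gradients).reshape(n_inst, -1, def_len, hidden_dim)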
Example #5
    def create_instances(self, input_path, target_topic, target_seq_length):
        tokenizer = get_tokenizer()
        doc_top_k = 1000

        all_train_data = list(load_record(input_path))
        train_data = []
        for feature in all_train_data:
            input_ids = feature["input_ids"].int64_list.value
            token_id = input_ids[1]
            topic = token_ids_to_topic[token_id]
            if target_topic == topic:
                train_data.append(feature)

        print("Selected {} from {}".format(len(train_data), len(all_train_data)))

        doc_dict = load_tokens_for_topic(target_topic)
        token_doc_list = []
        ranked_list = sydney_get_ukp_ranked_list()[target_topic]
        print("Ranked list contains {} docs, selecting top-{}".format(len(ranked_list), doc_top_k))
        doc_ids = [doc_id for doc_id, _, _ in ranked_list[:doc_top_k]]

        for doc_id in doc_ids:
            doc = doc_dict[doc_id]
            token_doc = pool_tokens(doc, target_seq_length)
            token_doc_list.extend(token_doc)

        ranker = Ranker()
        target_tf_list = lmap(ranker.get_terms, token_doc_list)

        ranker.init_df_from_tf_list(target_tf_list)

        inv_index = collections.defaultdict(list)
        for doc_idx, doc_tf in enumerate(target_tf_list):
            for term in doc_tf:
                if ranker.df[term] < ranker.N * 0.3:
                    inv_index[term].append(doc_idx)


        def get_candidate_from_inv_index(inv_index, terms):
            s = set()
            for t in terms:
                s.update(inv_index[t])
            return s

        source_tf_list = []
        selected_context = []
        for s_idx, feature in enumerate(train_data):
            input_ids = feature["input_ids"].int64_list.value
            topic_seg, sent = split_p_h_with_input_ids(input_ids, input_ids)
            source_tf = ranker.get_terms_from_ids(sent)
            source_tf_list.append(source_tf)
            ranked_list = []
            candidate_docs = get_candidate_from_inv_index(inv_index, source_tf.keys())
            for doc_idx in candidate_docs:
                target_tf = target_tf_list[doc_idx]
                score = ranker.bm25(source_tf, target_tf)
                ranked_list.append((doc_idx, score, target_tf))
            ranked_list.sort(key=lambda x: x[1], reverse=True)
            ranked_list = list(filter_overlap(ranked_list))
            ranked_list = ranked_list[:self.max_context]

            if s_idx < 10:
                print("--- Source sentence : \n", pretty_tokens(tokenizer.convert_ids_to_tokens(sent), True))
                print("-------------------")
                for rank, (idx, score, target_tf) in enumerate(ranked_list):
                    ranker.bm25(source_tf, target_tf, True)
                    print("Rank#{}  {} : ".format(rank, score) + pretty_tokens(token_doc_list[idx], True))
            if s_idx % 100 == 0:
                print(s_idx)
            contexts = list([token_doc_list[idx] for idx, score, _ in ranked_list])
            selected_context.append(contexts)

        for sent_idx, feature in enumerate(train_data):
            contexts = selected_context[sent_idx]
            yield feature, contexts
Example #6
    def ids_to_pretty_text(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokenization.pretty_tokens(tokens, True)
Example #7
def load_and_visualize():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"

    n_list = open(os.path.join(output_path, "lookup_n", data_id),
                  "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = next(feature_itr)

            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)

            emph_word = "<b>" + word + "</b>"

            if j == 0:
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, term in zip(masked_lm_positions, masked_terms):
                    mask_ans[pos] = term

                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            for k in range(len(tokens)):
                if tokens[k] == "[MASK]":
                    tokens[k] = "[MASK_{}: {}]".format(k, mask_ans[k])
                if k in d_location_ids and k != 0:
                    if tokens[k - 1] != emph_word:
                        tokens[k] = emph_word
                    else:
                        tokens[k] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = list()
            row.append(Cell(word))
            row.append(Cell(data[feature_idx]))
            row.append(Cell(def_str))
            rows.append(row)

            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)

        html_writer.write_table(rows)

    html_writer.close()
Example #8
def loss_drop_tendency():
    tokenizer = get_tokenizer()
    filename = "ukp_all_loss_1.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    batch_size, seq_length = data[0]['input_ids'].shape

    keys = list(data[0].keys())
    vectors = {}

    for e in data:
        for key in keys:
            if key not in vectors:
                vectors[key] = []
            vectors[key].append(e[key])

    for key in keys:
        vectors[key] = np.concatenate(vectors[key], axis=0)

    n_instance = len(vectors['input_ids'])
    print("n_instance ", n_instance)
    token_cnt = Counter()
    acc_prob_before = Counter()
    acc_prob_after = Counter()
    num_predictions = len(vectors["grouped_positions"][0][0])

    prev_word = defaultdict(list)
    context = defaultdict(list)

    def bin_fn(v):
        return int(v / 0.05)

    bin_avg_builder = BinAverage(bin_fn)

    for i in range(n_instance):
        tokens = tokenizer.convert_ids_to_tokens(vectors['input_ids'][i])
        positions = vectors["grouped_positions"][i]

        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]

                # get the term at the target location
                t = combine(tokens[loc - 1], tokens[loc])

                ctx = pretty_tokens(tokens[loc - 5:loc + 4], drop_sharp=False)

                prob_before = math.exp(-loss1)
                prob_after = math.exp(-loss2)

                prev_word[t].append(tokens[loc - 1])
                context[t].append(ctx)
                token_cnt[t] += 1
                acc_prob_before[t] += prob_before
                acc_prob_after[t] += prob_after
                bin_avg_builder.add(prob_before, prob_after)
    bin_avg = bin_avg_builder.all_average()
    infos = []

    for t in token_cnt:
        cnt = token_cnt[t]
        avg_prob_before = acc_prob_before[t] / cnt
        avg_prob_after = acc_prob_after[t] / cnt
        avg_diff = avg_prob_before - avg_prob_after
        e = t, avg_prob_before, avg_prob_after, avg_diff, cnt
        infos.append(e)

    infos = list([e for e in infos if e[4] > 30])
    info_d = {e[0]: e for e in infos}
    relation_extraction_keywords = [
        "founded_by", "located_in", "died_of", "such_as", "or_other",
        "and_other", "is_buried", "was_born"
    ]

    ukp_keywords = [
        "do_you", "the_above", "once_the", "would_save", "therefore_,",
        "they_viewed", "lead_to", "an_increase", "may_not", "was_common",
        "the_number"
    ]

    group_A = relation_extraction_keywords
    group_B = ukp_keywords

    def get_avg_drop_nli(score):
        mapping = [
            0.00592526, 0.046476, 0.06966591, 0.0852695, 0.0819463, 0.11604176,
            0.13313128, 0.14524656, 0.17160586, 0.18507012, 0.19524361,
            0.23223796, 0.23867801, 0.2618752, 0.28670366, 0.31369072,
            0.3431509, 0.39701927, 0.45573084, 0.72065012, 0.99999865
        ]
        st = [
            0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
            0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.,
        ]
        for i, st_i in enumerate(st):
            if st_i <= score < st_i + 0.05:
                return mapping[i]
        assert False

    def get_avg_drop(v):
        bin_id = bin_fn(v)
        return bin_avg[bin_id]

    for t in group_A + group_B:
        if t not in info_d:
            print("Does not exists: ", t)
        else:
            t, avg_prob_before, avg_prob_after, avg_diff, cnt = info_d[t]
            v = get_avg_drop(avg_prob_before)

            if avg_prob_after > v:
                print("RIGHT: {} bf={} af={} av_af={}".format(
                    t, avg_prob_before, avg_prob_after, v))
            else:
                print("LEFT: {} bf={} af={} av_af={}".format(
                    t, avg_prob_before, avg_prob_after, v))

    def entropy(cnt_dict: Counter):
        total = sum(cnt_dict.values())

        ent = 0
        for key, value in cnt_dict.items():
            p = value / total

            ent += -p * math.log(p)
        return ent

    def print_n(e_list, n):
        for e in e_list[:n]:
            t, avg_prob_before, avg_prob_after, avg_diff, cnt = e
            print("{}  ({})".format(t, cnt))
            print("Before : {0:3f}".format(avg_prob_before))
            print("After  : {0:3f}".format(avg_prob_after))
            print("AvgDiff: {0:3f}".format(avg_diff))
            term_stat = Counter(prev_word[e[0]])
            print(term_stat)
            print(context[t])
            print("Entropy: ", entropy(term_stat))

    print(type(infos[0][0]))
    print(type(infos[0][1]))
    print(type(infos[0][2]))

    print("<< Most common >>")
    infos.sort(key=lambda x: x[1], reverse=True)
    print_n(infos, 10)
    print("---------------------")

    infos.sort(key=lambda x: x[3], reverse=True)
    print("<<  Big Drop  >>")
    print_n(infos, 10)
    print("---------------------")

    infos.sort(key=lambda x: x[3], reverse=False)
    print("<< Negative Drop (NLI Improve >>")
    print_n(infos, 10)
    print("---------------------")

    plt.rcParams.update({'font.size': 22})
    fig, ax = plt.subplots()
    infos = list([e for e in infos if e[4] > 30])

    y = list([x[1] for x in infos])
    z = list([x[2] for x in infos])
    fig.set_size_inches(18.5, 10.5)

    ax.scatter(z, y)
    x = np.linspace(0, 1, 1000)
    ax.plot(x, x)

    for i, e in enumerate(infos):
        ax.annotate(e[0], (z[i], y[i]))

    mpld3.show()
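BinAverage is another repository helper. From the calls above (BinAverage(bin_fn), add(key_value, value), all_average(), then indexing by bin id), an equivalent sketch might be the following; this is a reconstruction from usage, not the actual class.

from collections import defaultdict


class BinAverageSketch:
    # Reconstructed from usage: bucket values by bin_fn(key) and report
    # the mean of the values collected in each bucket.
    def __init__(self, bin_fn):
        self.bin_fn = bin_fn
        self.bins = defaultdict(list)

    def add(self, key, value):
        self.bins[self.bin_fn(key)].append(value)

    def all_average(self):
        return {b: sum(v) / len(v) for b, v in self.bins.items()}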
Example #9
def main():
    doc_id = "clueweb12-0005wb-96-30750"
    doc = load(BertTokenizedCluewebDoc, doc_id)
    print("doc has {} lines", len(doc))
    print("last line:", pretty_tokens(doc[-1], True))
Example #10
def text_print():
    d = get_tokens()
    for key, tokens in d.items():
        print(key)
        print(pretty_tokens(tokens))
Example #11
    def print_ids(ids):
        print(pretty_tokens(tokenizer.convert_ids_to_tokens(ids), True))