Example #1
def show_tfrecord(file_path):

    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(file_path)
    html = HtmlVisualizer(name + ".html")
    for features in itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        p_cells = [
            Cell(p_tokens[i], 100 if p_mask[i] else 0)
            for i in range(len(p_tokens))
        ]
        h_cells = [
            Cell(h_tokens[i], 100 if h_mask[i] else 0)
            for i in range(len(h_tokens))
        ]

        label = take(features["label_ids"])[0]

        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])
Example #2
def convert_to_unpaired(source_path, output_path):
    def feature_transformer(feature):
        new_features_1 = collections.OrderedDict()
        new_features_2 = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        new_features_1["input_ids"] = put("input_ids1")
        new_features_1["input_mask"] = put("input_mask1")
        new_features_1["segment_ids"] = put("segment_ids1")
        new_features_1["label_ids"] = create_int_feature([1])

        new_features_2["input_ids"] = put("input_ids2")
        new_features_2["input_mask"] = put("input_mask2")
        new_features_2["segment_ids"] = put("segment_ids2")
        new_features_2["label_ids"] = create_int_feature([0])

        return new_features_1, new_features_2

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1, new_features_2 = feature_transformer(feature)
        writer.write_feature(new_features_1)
        writer.write_feature(new_features_2)
    writer.close()
Example #3
def tfrecord_convertor(source_path: FilePath, output_path: FilePath,
                       feature_transformer):
    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features = feature_transformer(feature)
        writer.write_feature(new_features)
    writer.close()
Example #4
    def work(self, job_id):
        file_path = os.path.join(self.lm_dir, str(job_id))
        out_path = os.path.join(self.working_dir, str(job_id))
        lm_itr = load_record_v2(file_path)
        random.shuffle(self.tt_entries)
        idx = 0
        writer = RecordWriterWrap(out_path)
        for lm_entry in lm_itr:
            # Pair each LM entry with the next shuffled NLI entry.
            nli_entry = self.tt_entries[idx]
            idx += 1
            new_features = combine_feature(lm_entry, nli_entry)
            writer.write_feature(new_features)
        writer.close()
Example #5
def generate_training_data(data_id):
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id),
        "r").readlines()
    p = os.path.join(working_path, "entry_loss",
                     "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))

    instance_idx = 0
    writer = tf.python_io.TFRecordWriter(
        os.path.join(working_path, "entry_prediction_train", data_id))

    n = len(num_samples_list)
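    # Each count in num_samples_list says how many consecutive records belong to
    # one entry: the first record is the no-dictionary baseline, and each of the
    # remaining samples is labeled useful (1) only when its loss is more than
    # 10% lower than that baseline.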
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = feature_itr.__next__()

        if instance_idx + n_sample >= len(loss_outputs):
            break

        if n_sample == 1:
            continue

        no_dict_loss = loss_outputs[instance_idx]
        instance_idx += 1
        all_samples = []
        for j in range(1, n_sample):
            feature = feature_itr.__next__()
            loss = loss_outputs[instance_idx]
            if loss < no_dict_loss * 0.9:
                label = 1
            else:
                label = 0
            new_features = collections.OrderedDict()

            for key in feature:
                new_features[key] = btd.create_int_feature(take(feature[key]))

            new_features["useful_entry"] = btd.create_int_feature([label])

            example = tf.train.Example(features=tf.train.Features(
                feature=new_features))
            writer.write(example.SerializeToString())
            instance_idx += 1

    writer.close()
Example #6
def visualize(filename, n_item):
    for idx, features in enumerate(load_record_v2(filename)):
        if idx > n_item:
            break
        keys = features.keys()
        for key in keys:
            feature = features[key]
            if feature.int64_list.value:
                values = feature.int64_list.value
            elif feature.float_list.value:
                values = feature.float_list.value
            else:
                values = []
            print("{} : {}".format(key, values[:50]))
Example #7
def visualize_prediction_data(data_id):
    tokenizer = get_tokenizer()
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id),
        "r").readlines()
    p = os.path.join(working_path, "entry_loss",
                     "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    instance_idx = 0
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    n = min(len(num_samples_list), 100)  # visualize at most the first 100 entries
    html = HtmlVisualizer("entry_prediction.html")
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = feature_itr.__next__()
        feature = Feature2Text(first_inst, tokenizer)

        html.write_headline("Input:")
        html.write_paragraph(feature.get_input_as_text(True, True))
        html.write_headline("Word:" + feature.get_selected_word_text())

        if instance_idx + n_sample >= len(loss_outputs):
            break

        if n_sample == 1:
            continue

        rows = []
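        # The first row is the baseline loss, computed without any dictionary definition.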
        no_dict_loss = loss_outputs[instance_idx]
        row = [Cell(no_dict_loss, 0), Cell("")]
        rows.append(row)
        instance_idx += 1
        for j in range(1, n_sample):
            feature = Feature2Text(feature_itr.__next__(), tokenizer)
            def_cell = Cell(feature.get_def_as_text())
            loss = loss_outputs[instance_idx]
            hl_score = 100 if loss < no_dict_loss * 0.9 else 0
            row = [Cell(loss, hl_score), def_cell]
            rows.append(row)
            instance_idx += 1

        html.write_table(rows)
Example #8
def do_verfiy():
    pred_file_name = "1.pickle"
    record_file_name = "C:\\work\\Code\\Chair\\output\\1"
    p = os.path.join(output_path, pred_file_name)
    data = pickle.load(open(p, "rb"))
    data = flatten_batches(data)
    itr1 = load_record_v2(record_file_name)
    itr2 = data["prob1"]
    print("itr2 len", len(itr2))
    cnt = 0
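    # Count the records in the TFRecord file, then subtract one per prediction;
    # the final count should come out to 0 if the two files are aligned.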
    for _ in itr1:
        cnt += 1
    print(cnt)
    for _ in itr2:
        cnt -= 1
    print(cnt)
Example #9
def count_terms(file_path):
    counter = Counter()

    for feature in load_record_v2(file_path):
        input_ids = take(feature["input_ids"])
        alt_emb_mask = take(feature["alt_emb_mask"])

        cur_words = []
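        # Count each maximal run of consecutive positions flagged by alt_emb_mask
        # as one term, keyed by its space-joined token-id signature.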
        for i in range(len(input_ids)):
            if alt_emb_mask[i]:
                cur_words.append(input_ids[i])
            else:
                if cur_words:
                    sig = " ".join([str(num) for num in cur_words])
                    counter[sig] += 1
                cur_words = []
    return counter
Example #10
def verify_alt_emb(source_path, seq_set: List[List[int]]):
    all_tokens: Set[int] = set(flatten(seq_set))

    def check_feature(feature):
        feature_d = {}
        for key in feature:
            v = take(feature[key])
            feature_d[key] = v

        input_ids = feature_d["input_ids"]
        alt_emb_mask = feature_d["alt_emb_mask"]

        for i in range(len(input_ids)):
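            # Report any position whose alt_emb_mask flag is set but whose token id
            # is not in the allowed set built from seq_set.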
            if alt_emb_mask[i] and input_ids[i] not in all_tokens:
                print(i, input_ids[i])

    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        check_feature(feature)
Example #11
def build_word_tf(continuation_tokens: Set[int], file_path):
    feature_itr = load_record_v2(file_path)
    counter = Counter()
    for feature in feature_itr:
        if not is_real_example(feature):
            continue

        input_ids = take(feature["input_ids"])
        cur_word = []
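        # Group each head token with the continuation pieces that follow it;
        # only multi-piece words contribute to the term-frequency counter.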
        for idx, token_id in enumerate(input_ids):
            if token_id in continuation_tokens:
                cur_word.append(token_id)
            else:
                if len(cur_word) > 1:
                    word_sig = " ".join([str(t) for t in cur_word])
                    counter[word_sig] += 1
                cur_word = [token_id]

    return counter
Example #12
def get_correctness(filename, file_path):
    itr = load_record_v2(file_path)
    data = EstimatorPredictionViewerGosford(filename)

    correctness = []
    for entry in data:
        features = itr.__next__()

        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
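        # Sanity check: the prediction entry and the TFRecord feature must
        # describe the same example before the prediction is compared to the label.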
        assert np.all(input_ids == input_ids2)
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)

        if pred == label:
            correctness.append(1)
        else:
            correctness.append(0)
    return correctness
Example #13
def run(filename, n_item):
    loss_list = []
    loss_list2 = []
    loss_list3 = []
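    # For each record, read the per-position base/target losses and the masked-LM
    # weight mask, then score them under the three comparison models.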
    for idx, features in enumerate(load_record_v2(filename)):
        if idx > n_item:
            break
        keys = features.keys()
        loss1 = features["loss_base"].float_list.value
        loss2 = features["loss_target"].float_list.value
        mask = features["masked_lm_weights"].float_list.value
        print(loss1)

        loss_list.append(
            independent_model(loss1, loss2, mask, proportion_random))
        loss_list2.append(diff_model(loss1, loss2, mask))
        loss_list3.append(independent_model(loss1, loss2, mask, same))

    print("independent (proportion random): ", average(loss_list))
    print("diff  : ", average(loss_list2))
    print("independent (same): ", average(loss_list3))
Example #14
def show_prediction(filename, file_path, correctness_1, correctness_2):

    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        features = itr.__next__()

        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        p_cells = [
            Cell(p_tokens[i], 100 if p_mask[i] else 0)
            for i in range(len(p_tokens))
        ]
        h_cells = [
            Cell(h_tokens[i], 100 if h_mask[i] else 0)
            for i in range(len(h_tokens))
        ]

        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)

        if not correctness_1[idx] or not correctness_2[idx]:
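            # At least one of the two runs got this example wrong, so render it.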
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])

        idx += 1
Example #15
    def __init__(self, working_dir):
        self.working_dir = working_dir
        self.lm_dir = os.path.join(sydney_working_dir, "unmasked_pair_x3")
        tt_path = os.path.join(output_path, "ukp_512", "train_death_penalty")
        self.tt_entries = list(load_record_v2(tt_path))
Example #16
def get_dir_all_itr(dir_path):
    for file_path in get_dir_files(dir_path):
        one_itr = load_record_v2(file_path)
        for item in one_itr:
            yield item
Example #17
def do_fix(source_path, output_path):
    max_num_seg = 4
    window_size = 512
    seq_length = 512 * max_num_seg
    input_names1 = [
        "input_ids1",
        "segment_ids1",
        "input_mask1",
    ]
    input_names2 = ["input_ids2", "input_mask2", "segment_ids2"]

    def feature_transformer(feature):
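        # Each side (1/2) packs up to max_num_seg windows back to back.
        # Locate every segment by its leading [CLS] token (id 101), re-pad it to
        # exactly window_size tokens, and append all-zero segments so each side
        # always spans seq_length positions.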
        new_features = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        for left_right_idx in [1, 2]:
            input_names = [input_names1, input_names2][left_right_idx - 1]
            input_ids = take(feature["input_ids{}".format(left_right_idx)])
            input_masks = take(feature["input_mask{}".format(left_right_idx)])
            cls_loc = []
            last_non_pad = -1
            for i in range(seq_length):
                if input_ids[i] == 101:
                    cls_loc.append(i)

                if input_masks[i]:
                    last_non_pad = i

            assert last_non_pad >= 0
            assert last_non_pad > cls_loc[-1]
            assert len(cls_loc) <= max_num_seg

            num_seg = len(cls_loc)
            input_building = {}
            for name in input_names:
                input_building[name] = []

            for i in range(num_seg):
                st = cls_loc[i]
                ed = cls_loc[i + 1] if i + 1 < num_seg else last_non_pad + 1
                pad_len = window_size - (ed - st)

                for input_name in input_names:
                    arr = take(feature[input_name])
                    seq = arr[st:ed] + pad_len * [0]
                    input_building[input_name].extend(seq)

            n_empty_seg = max_num_seg - num_seg
            for i in range(n_empty_seg):
                for input_name in input_names:
                    input_building[input_name].extend([0] * window_size)

            for input_name in input_names:
                checksum1 = sum(input_building[input_name])
                checksum2 = sum(take(feature[input_name]))
                assert checksum1 == checksum2

            for input_name in input_names:
                new_features[input_name] = create_int_feature(
                    input_building[input_name])

        new_features["data_ids"] = put("data_ids")
        return new_features

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1 = feature_transformer(feature)
        writer.write_feature(new_features_1)
    writer.close()
Example #18
def do():
    pred_file_name = "RLPP_0.pickle"
    pred_file_name = "ukp_rel.pickle"
    record_file_name = "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0"
    record_file_name = "C:\\work\\Code\\Chair\\output\\tf_enc"
    todo = [
        ("RLPP_0.pickle", "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0",
         "RLPP_wiki.html"),
        ("ukp_rel.pickle", "C:\\work\\Code\\Chair\\output\\tf_enc",
         "RLPP_ukp.html")
    ]
    x = []
    y = []
    for pred_file_name, record_file_name, out_name in todo:
        viewer = EstimatorPredictionViewerGosford(pred_file_name)
        html = HtmlVisualizer(out_name)
        itr1 = load_record_v2(record_file_name)
        itr2 = viewer.__iter__()
        cnt = 0
        for features, entry in zip(itr1, itr2):
            cnt += 1
            if cnt > 200:
                break
            input_ids1 = entry.get_tokens("input_ids")
            prob1 = entry.get_vector("prob1")
            prob2 = entry.get_vector("prob2")

            cells = viewer.cells_from_tokens(input_ids1)
            p1_l = []
            p2_l = []
            useful_l = []

            row1 = []
            row2 = []
            row3 = []
            row4 = []
            for j, cell in enumerate(cells):
                p1 = float(prob1[j])
                p2 = float(prob2[j])
                x.append([p1])
                y.append(p2)
                u = useful(p1, p2)
                score = (1 - u) * 100
                cell.highlight_score = score
                row1.append(cell)
                row2.append(Cell(p1, score))
                row3.append(Cell(p2, score))
                row4.append(Cell(u, score))

                p1_l.append(p1)
                p2_l.append(p2)
                useful_l.append(u)
                if len(row1) > 20:
                    rows = [row1, row2, row3, row4]
                    row1 = []
                    row2 = []
                    row3 = []
                    row4 = []
                    html.write_table(rows)

            html.write_paragraph("p1: {}".format(average(p1_l)))
            html.write_paragraph("p2: {}".format(average(p2_l)))
            html.write_paragraph("useful: {}".format(average(useful_l)))

            if average(useful_l) < 0.4:
                html.write_headline("Low Score")

        # Regress p2 on p1 over a random subsample of the collected points.
        l = list(zip(x, y))
        random.shuffle(l)
        l = l[:1000]
        x, y = map(list, zip(*l))  # keep lists so .append still works on the next file
        lin = LinearRegression()
        lin.fit(x, y)

        poly = PolynomialFeatures(degree=4)
        X_poly = poly.fit_transform(x)
        poly.fit(X_poly, y)
        lin2 = LinearRegression()
        lin2.fit(X_poly, y)
        plt.scatter(x, y, color='blue')

        plt.plot(x, lin2.predict(poly.fit_transform(x)), color='red')
        plt.title('Polynomial Regression')

        plt.show()