Example #1
def show_tfrecord(file_path):

    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(file_path)
    html = HtmlVisualizer(name + ".html")
    for features in itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        p_cells = [
            Cell(p_tokens[i], 100 if p_mask[i] else 0)
            for i in range(len(p_tokens))
        ]
        h_cells = [
            Cell(h_tokens[i], 100 if h_mask[i] else 0)
            for i in range(len(h_tokens))
        ]

        label = take(features["label_ids"])[0]

        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])
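
These examples all rely on a take helper that the listing itself never defines. A minimal sketch of what it is assumed to do, namely unwrap the int64 payload of a tf.train.Feature, which matches the direct feature["doc_ids"].int64_list.value access in Example #23:

def take(feature):
    # Hypothetical sketch, not the repository's actual code: return the int64
    # values stored in a tf.train.Feature.
    return feature.int64_list.value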
Example #2
def combine_segment(features_c, features_p) -> OrderedDict:
    # input_ids does not contain CLS, SEP
    c_seg_id = list(take(features_c['segment_ids']))
    max_seq_len = len(c_seg_id)

    st = c_seg_id.index(1)
    input_mask = list(take(features_c['input_mask']))

    ed = input_mask.index(0)

    feature_c_input_ids = take(features_c['input_ids'])
    paragraph = feature_c_input_ids[st:ed]

    c_input_ids = feature_c_input_ids[:st]

    feature_p_input_ids = take(features_p['input_ids'])
    p_seg_id = list(take(features_p['segment_ids']))

    st = p_seg_id.index(1)
    p_input_ids = feature_p_input_ids[:st]

    input_ids = [CLS_ID] + c_input_ids + p_input_ids + [SEP_ID] + paragraph + [
        SEP_ID
    ]  #+ [random.randint(10, 13)]
    segment_ids = [0] * (2 + len(c_input_ids) +
                         len(p_input_ids)) + [1] * (1 + len(paragraph))
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        input_ids, segment_ids, max_seq_len)
    return ordered_dict_from_input_segment_mask_ids(input_ids, input_mask,
                                                    segment_ids)
Example #3
def main(dir_path):
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    all_insts = [neg_insts, pos_insts]

    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))

                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    random.shuffle(pos_insts)
    random.shuffle(neg_insts)

    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))

    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
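
load_record and load_record_v2 are likewise assumed helpers. A minimal sketch, assuming they walk a TFRecord file and yield the feature map of each parsed tf.train.Example, using the same TF1-style tf.python_io API that Example #19 uses for writing:

import tensorflow as tf

def load_record(file_path):
    # Hypothetical sketch: yield the {name: tf.train.Feature} map of every
    # serialized tf.train.Example in the file.
    for record in tf.python_io.tf_record_iterator(file_path):
        example = tf.train.Example()
        example.ParseFromString(record)
        yield example.features.feature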
Example #4
    def feature_transformer(feature):
        new_features = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        for left_right_idx in [1, 2]:
            input_names = [input_names1, input_names2][left_right_idx - 1]
            input_ids = take(feature["input_ids{}".format(left_right_idx)])
            input_masks = take(feature["input_mask{}".format(left_right_idx)])
            cls_loc = []
            last_non_pad = -1
            for i in range(seq_length):
                if input_ids[i] == 101:  # 101 is BERT's [CLS] token id
                    cls_loc.append(i)

                if input_masks[i]:
                    last_non_pad = i

            assert last_non_pad >= 0
            assert last_non_pad > cls_loc[-1]
            assert len(cls_loc) <= max_num_seg

            num_seg = len(cls_loc)
            input_building = {}
            for name in input_names:
                input_building[name] = []

            for i in range(num_seg):
                st = cls_loc[i]
                ed = cls_loc[i + 1] if i + 1 < num_seg else last_non_pad + 1
                pad_len = window_size - (ed - st)

                for input_name in input_names:
                    arr = take(feature[input_name])
                    seq = arr[st:ed] + pad_len * [0]
                    input_building[input_name].extend(seq)

            n_empty_seg = max_num_seg - num_seg
            for i in range(n_empty_seg):
                for input_name in input_names:
                    input_building[input_name].extend([0] * window_size)

            for input_name in input_names:
                checksum1 = sum(input_building[input_name])
                checksum2 = sum(take(feature[input_name]))
                assert checksum1 == checksum2

            for input_name in input_names:
                new_features[input_name] = create_int_feature(
                    input_building[input_name])

        new_features["data_ids"] = put("data_ids")
        return new_features
Example #5
def combine_feature(lm_entry, nli_entry):
    new_features = collections.OrderedDict()
    for key in lm_entry:
        new_features[key] = create_int_feature(take(lm_entry[key]))
    for key in nli_entry:
        if key == "label_ids":
            new_features[key] = create_int_feature(take(nli_entry[key]))
        else:
            new_key = "nli_" + key
            new_features[new_key] = create_int_feature(take(nli_entry[key]))
    return new_features
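
create_int_feature is another assumed helper; in the BERT reference code a function of this name wraps a list of integers into a tf.train.Feature, and these examples appear to use it the same way. A minimal sketch under that assumption:

def create_int_feature(values):
    # Wrap a list of ints into a tf.train.Feature (BERT-style helper).
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))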
Example #6
def write_feature_to_html(feature, html, tokenizer):
    input_ids = take(feature['input_ids'])
    label_ids = take(feature['label_ids'])
    seg1, seg2 = split_p_h_with_input_ids(input_ids, input_ids)
    text1 = tokenizer.convert_ids_to_tokens(seg1)
    text2 = tokenizer.convert_ids_to_tokens(seg2)
    text1 = pretty_tokens(text1, True)
    text2 = pretty_tokens(text2, True)
    html.write_headline("{}".format(label_ids[0]))
    html.write_paragraph(text1)
    html.write_paragraph(text2)
Example #7
    def load(file_no):
        path = os.path.join(data_path, "pc_rel_tfrecord_dev", str(file_no))
        d = {}
        for feature in load_record(path):
            data_id = take(feature["data_id"])[0]
            input_ids = take(feature["input_ids"])
            segment_ids = take(feature["segment_ids"])
            d[data_id] = input_ids, segment_ids
            print(data_id)
        print("loaded {} data".format(len(d)))
        return d
Example #8
def write_feature_to_html(feature, html, tokenizer):
    input_ids = take(feature['input_ids'])
    focus_mask = take(feature['focus_mask'])
    label_ids = take(feature['label_ids'])
    text1 = tokenizer.convert_ids_to_tokens(input_ids)

    row = []
    for i in range(len(input_ids)):
        highlight_score = 100 if focus_mask[i] else 0
        row.append(Cell(text1[i], highlight_score))

    html.write_headline("{}".format(label_ids[0]))
    html.multirow_print(row)
Example #9
def get_paragraph(features_c, features_p) -> Iterator[int]:
    # input_ids does not contain CLS, SEP
    c_seg_id = list(take(features_c['segment_ids']))
    max_seq_len = len(c_seg_id)

    st = c_seg_id.index(1)
    input_mask = list(take(features_c['input_mask']))

    ed = input_mask.index(0)

    feature_c_input_ids = take(features_c['input_ids'])
    paragraph = feature_c_input_ids[st:ed]
    return paragraph
Example #10
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        success = False
        for key in feature:
            v = take(feature[key])
            if key == "input_ids":
                alt_emb_mask = [0] * len(v)
                s = set(v)
                if len(s.intersection(all_tokens)) >= min_overlap:
                    for word in seq_set:
                        pre_match = 0
                        for i in range(len(v)):
                            if v[i] == word[pre_match]:
                                pre_match += 1
                            else:
                                pre_match = 0
                            if pre_match == len(word):
                                pre_match = 0
                                for j in range(i - len(word) + 1, i + 1):
                                    alt_emb_mask[j] = 1
                                    success = True
                new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
            new_features[key] = create_int_feature(v)

        if success:
            return new_features
        else:
            return None
Example #11
def rel_filter_to_para(tfrecord_itr,
                       relevance_scores: Dict[DataID, Tuple[CPIDPair, Logits,
                                                            Logits]],
                       cpid_to_label: Dict[CPIDPair, int]) -> Iterator[Tuple]:

    last_feature = None
    for features in tfrecord_itr:
        if last_feature is None:
            last_feature = features
            continue

        data_id = take(features["data_id"])[0]
        t = relevance_scores[data_id]
        cpid: CPIDPair = t[0]
        c_logits = t[1]
        p_logits = t[2]

        c_score = softmax(c_logits)[1]
        p_score = softmax(p_logits)[1]

        weight = c_score * p_score
        label: int = cpid_to_label[cpid]

        if weight > 0.5:
            paragraph = get_paragraph(last_feature, features)
        else:
            paragraph = []

        output_entry = cpid, label, paragraph, c_score, p_score
        yield output_entry
        last_feature = None  # reset so the next feature starts a new (c, p) pair, mirroring rel_filter in Example #13
Example #12
    def debug_call_back(features):
        nonlocal inst_cnt
        if inst_cnt < 4:
            input_tokens = tokenizer.convert_ids_to_tokens(
                take(features['input_ids']))
            print(pretty_tokens(input_tokens))
        inst_cnt += 1
Example #13
def rel_filter(tfrecord_itr,
               relevance_scores: Dict[DataID, Tuple[CPIDPair, Logits, Logits]],
               cpid_to_label: Dict[CPIDPair, int]) -> Iterator[OrderedDict]:

    last_feature = None
    for features in tfrecord_itr:
        if last_feature is None:
            last_feature = features
            continue

        data_id = take(features["data_id"])[0]
        t = relevance_scores[data_id]
        cpid: CPIDPair = t[0]
        c_logits = t[1]
        p_logits = t[2]

        c_score = softmax(c_logits)[1]
        p_score = softmax(p_logits)[1]

        weight = c_score * p_score
        label: int = cpid_to_label[cpid]

        if weight > 0.5:
            new_feature = combine_segment(last_feature, features)
            #new_feature['weight'] = create_float_feature([weight])
            new_feature['label_ids'] = create_int_feature([label])
            new_feature['data_id'] = create_int_feature([data_id])
            yield new_feature
        last_feature = None
Example #14
def count_terms(file_path):
    counter = Counter()

    for feature in load_record_v2(file_path):
        input_ids = take(feature["input_ids"])
        alt_emb_mask = take(feature["alt_emb_mask"])

        cur_words = []
        for i in range(len(input_ids)):
            if alt_emb_mask[i]:
                cur_words.append(input_ids[i])
            else:
                if cur_words:
                    sig = " ".join([str(num) for num in cur_words])
                    counter[sig] += 1
                cur_words = []
    return counter
Example #15
def tfrecord_to_old_stype(tfrecord_path, feature_names: List):
    all_insts = []
    for feature in load_record(tfrecord_path):
        inst = []
        for key in feature_names:
            v = take(feature[key])
            inst.append(list(v))
        all_insts.append(inst)
    return all_insts
Example #16
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        mapping = {0: 0, 1: 1, 2: 1}

        for key in feature:
            v = take(feature[key])
            if key == "label_ids":
                v = [mapping[v[0]]]
            new_features[key] = create_int_feature(v)

        return new_features
Example #17
def get_correctness(filename, file_path):
    itr = load_record_v2(file_path)
    data = EstimatorPredictionViewerGosford(filename)

    correctness = []
    for entry in data:
        features = itr.__next__()

        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)

        if pred == label:
            correctness.append(1)
        else:
            correctness.append(0)
    return correctness
Example #18
def main(dir_path):
    output_path = os.path.join(dir_path, "all")
    writer = RecordWriterWrap(output_path)
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                writer.write_feature(new_features)
Example #19
def generate_training_data(data_id):
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id),
        "r").readlines()
    p = os.path.join(working_path, "entry_loss",
                     "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))

    instance_idx = 0
    writer = tf.python_io.TFRecordWriter(
        os.path.join(working_path, "entry_prediction_train", data_id))

    n = len(num_samples_list)
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = feature_itr.__next__()

        if instance_idx + n_sample >= len(loss_outputs):
            break

        if n_sample == 1:
            continue

        no_dict_loss = loss_outputs[instance_idx]
        instance_idx += 1
        all_samples = []
        for j in range(1, n_sample):
            feature = feature_itr.__next__()
            loss = loss_outputs[instance_idx]
            if loss < no_dict_loss * 0.9:
                label = 1
            else:
                label = 0
            new_features = collections.OrderedDict()

            for key in feature:
                new_features[key] = btd.create_int_feature(take(feature[key]))

            new_features["useful_entry"] = btd.create_int_feature([label])

            example = tf.train.Example(features=tf.train.Features(
                feature=new_features))
            writer.write(example.SerializeToString())

    writer.close()
Example #20
    def check_feature(feature):
        feature_d = {}
        for key in feature:
            v = take(feature[key])
            feature_d[key] = v

        input_ids = feature_d["input_ids"]
        alt_emb_mask = feature_d["alt_emb_mask"]

        for i in range(len(input_ids)):
            if alt_emb_mask[i] and input_ids[i] not in all_tokens:
                print(i, input_ids[i])
Example #21
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        mapping = {0: 0, 1: 0, 2: 1}

        for key in feature:
            l = take(feature[key])
            if key == "segment_ids":
                l = list([mapping[v] for v in l])

            new_features[key] = create_int_feature(l)

        return new_features
Example #22
def show_prediction(filename, file_path, correctness_1, correctness_2):

    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        features = itr.__next__()

        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        p_cells = [
            Cell(p_tokens[i], 100 if p_mask[i] else 0)
            for i in range(len(p_tokens))
        ]
        h_cells = [
            Cell(h_tokens[i], 100 if h_mask[i] else 0)
            for i in range(len(h_tokens))
        ]

        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)

        if not correctness_1[idx] or not correctness_2[idx]:
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])

        idx += 1
Example #23
def transform(max_seq_length, feature):
    query_ids = take(feature["query_ids"])
    doc_ids = feature["doc_ids"].int64_list.value
    label_ids = feature["label"].int64_list.value[0]

    input_ids = list(query_ids) + list(doc_ids)
    segment_ids = [0] * len(query_ids) + [1] * len(doc_ids)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    return input_ids, input_mask, segment_ids, label_ids
Example #24
def build_word_tf(continuation_tokens: Set[int], file_path):
    feature_itr = load_record_v2(file_path)
    counter = Counter()
    for feature in feature_itr:
        if not is_real_example(feature):
            continue

        input_ids = take(feature["input_ids"])
        cur_word = []
        for idx, token_id in enumerate(input_ids):
            if token_id in continuation_tokens:
                cur_word.append(token_id)
            else:
                if len(cur_word) > 1:
                    word_sig = " ".join([str(t) for t in cur_word])
                    counter[word_sig] += 1
                cur_word = [token_id]

    return counter
Example #25
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        success = False
        for key in feature:
            v = take(feature[key])
            if key == "input_ids":
                input_ids = v
                success, alt_emb_mask, alt_input_ids = get_alt_emb(input_ids)
                if not success and include_not_match:
                    assert len(input_ids) > 0
                    alt_emb_mask = [0] * len(input_ids)
                    alt_input_ids = [0] * len(input_ids)

                new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
                new_features["alt_input_ids"] = create_int_feature(
                    alt_input_ids)
            new_features[key] = create_int_feature(v)

        if success or include_not_match:
            return new_features
        else:
            return None
Example #26
    def write_instance(self, instances, output_path):
        writer = RecordWriterWrap(output_path)
        for (inst_index, instance) in enumerate(instances):
            new_features = collections.OrderedDict()
            feature, contexts = instance
            for key in feature:
                v = take(feature[key])
                new_features[key] = create_int_feature(v[:self.max_seq_length])

            context_input_ids = []
            context_input_mask = []
            context_segment_ids = []

            for tokens in contexts:
                segment_ids = [0] * len(tokens)
                input_ids, input_mask, segment_ids = \
                    get_basic_input_feature_as_list(self.tokenizer, self.max_context_len, tokens, segment_ids)
                context_input_ids.extend(input_ids)
                context_input_mask.extend(input_mask)
                context_segment_ids.extend(segment_ids)

            dummy_len = self.max_context - len(contexts)
            for _ in range(dummy_len):
                input_ids, input_mask, segment_ids = \
                    get_basic_input_feature_as_list(self.tokenizer, self.max_context_len, [], [])
                context_input_ids.extend(input_ids)
                context_input_mask.extend(input_mask)
                context_segment_ids.extend(segment_ids)

            new_features["context_input_ids"] = create_int_feature(context_input_ids)
            new_features["context_input_mask"] = create_int_feature(context_input_mask)
            new_features["context_segment_ids"] = create_int_feature(context_segment_ids)
            writer.write_feature(new_features)
            if inst_index < 20:
                log_print_feature(new_features)
        writer.close()
Example #27
def feature_to_ordered_dict(feature):
    new_features = collections.OrderedDict()
    for key in feature:
        new_features[key] = create_int_feature(take(feature[key]))
    return new_features
Example #28
    def put(feature_name):
        return create_int_feature(take(feature[feature_name]))
Example #29
    def condition_fn(features):
        return id_keyword in take(features['input_ids'])
Example #30
def is_real_example(feature):
    if "is_real_example" not in feature:
        return True
    return take(feature["is_real_example"])[0] == 1