Example No. 1
def enum_dir_records(dir_path):
    file_path_list = get_dir_files(dir_path)

    while True:
        for file_path in file_path_list:
            for item in load_record(file_path):
                yield item
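Every example in this listing is built around load_record, which reads a TFRecord file and yields the feature map of each serialized tf.train.Example. A minimal sketch of what it presumably does, assuming TensorFlow 2.x eager execution (the function name comes from the examples; the body is an assumption):

import tensorflow as tf

def load_record(file_path):
    # Yield the name -> tf.train.Feature map of every record, so callers can
    # read values such as feature["input_ids"].int64_list.value.
    for raw_record in tf.data.TFRecordDataset(file_path):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        yield example.features.feature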
Example No. 2
def main(dir_path):
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    all_insts = [neg_insts, pos_insts]

    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))

                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    random.shuffle(pos_insts)
    random.shuffle(neg_insts)

    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))

    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    writer.close()
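Example No. 2 also leans on the small helpers take, create_int_feature, and foreach. Plausible one-screen versions, assuming the usual BERT-style feature conventions (these are sketches, not the repository's actual implementations):

import tensorflow as tf

def take(feature):
    # Assumption: extract the raw value list from a tf.train.Feature,
    # preferring int64 values and falling back to float values.
    return feature.int64_list.value or feature.float_list.value

def create_int_feature(values):
    # Wrap an integer sequence as a tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

def foreach(fn, iterable):
    # Apply fn to every item, for side effects only.
    for item in iterable:
        fn(item)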
Example No. 3
def tfrecord_to_old_stype(tfrecord_path, feature_names: List):
    all_insts = []
    for feature in load_record(tfrecord_path):
        inst = []
        for key in feature_names:
            v = take(feature[key])
            inst.append(list(v))
        all_insts.append(inst)
    return all_insts
Example No. 4
def do_filtering(file_path, out_path, condition_fn, debug_call_back=None):
    writer = RecordWriterWrap(out_path)
    for item in load_record(file_path):
        features = feature_to_ordered_dict(item)
        if condition_fn(features):
            if debug_call_back is not None:
                debug_call_back(features)
            writer.write_feature(features)
    writer.close()
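RecordWriterWrap is the writing counterpart of load_record throughout these examples. A minimal sketch of such a wrapper over tf.io.TFRecordWriter (the class name comes from the examples; the body is an assumption):

import tensorflow as tf

class RecordWriterWrap:
    def __init__(self, out_path):
        self.writer = tf.io.TFRecordWriter(out_path)
        self.total_written = 0

    def write_feature(self, features):
        # features is an OrderedDict mapping name -> tf.train.Feature.
        example = tf.train.Example(features=tf.train.Features(feature=features))
        self.writer.write(example.SerializeToString())
        self.total_written += 1

    def close(self):
        self.writer.close()

With these pieces, a hypothetical call such as do_filtering(in_path, out_path, lambda f: take(f["label_ids"])[0] == 1) would keep only positively labeled records.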
Example No. 5
def run(dir_path, save_dir):
    exist_or_mkdir(save_dir)
    for split in ["train", "dev"]:
        for idx, topic in enumerate(
                data_generator.argmining.ukp_header.all_topics):
            file_name = "{}_{}".format(split, topic)
            file_path = os.path.join(dir_path, file_name)
            save_path = os.path.join(save_dir, file_name)
            augment_topic_ids(load_record(file_path), save_path)
Example No. 6
def read(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    for feature in examples:
        print(inst2str(feature, tokenizer))
        print()
        print()
Example No. 7
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)
        save_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(save_path)
        for f in collect_passages(features, self.relevance_scores,
                                  self.cpid_to_label, self.num_max_para,
                                  self.window_size):
            writer.write_feature(f)
        writer.close()
Example No. 8
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)

        save_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(save_path)
        for f in rel_filter(features, self.relevance_scores,
                            self.cpid_to_label):
            writer.write_feature(f)
        writer.close()
Example No. 9
def main(dir_path):
    output_path = os.path.join(dir_path, "all")
    writer = RecordWriterWrap(output_path)
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                writer.write_feature(new_features)
    writer.close()
Example No. 10
def load(file_no):
    path = os.path.join(data_path, "pc_rel_tfrecord_dev", str(file_no))
    d = {}
    for feature in load_record(path):
        data_id = take(feature["data_id"])[0]
        input_ids = take(feature["input_ids"])
        segment_ids = take(feature["segment_ids"])
        d[data_id] = input_ids, segment_ids
        print(data_id)
    print("loaded {} data".format(len(d)))
    return d
Example No. 11
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)

        save_path = os.path.join(self.out_dir, str(job_id))
        all_entry = []
        for entry in rel_filter_to_para(features, self.relevance_scores,
                                        self.cpid_to_label):
            all_entry.append(entry)

        pickle.dump(all_entry, open(save_path, "wb"))
Example No. 12
def get_lm_tf(fn, sample_size=None, as_subword=True):
    tokenizer = get_tokenizer()
    tfrecord_itr = load_record(fn)

    lm = LM(as_subword, tokenizer)

    for idx, inst in enumerate(tfrecord_itr):
        if sample_size is not None and idx > sample_size:
            break
        input_ids = inst["input_ids"].int64_list.value
        lm.update(input_ids)

    return lm.tf
Example No. 13
def print_as_html(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    html_output = HtmlVisualizer("out_name.html")

    for feature in examples:
        masked_inputs = feature["input_ids"].int64_list.value
        idx = 0
        step = 512
        while idx < len(masked_inputs):
            chunk = masked_inputs[idx:idx + step]
            tokens = tokenizer.convert_ids_to_tokens(chunk)
            idx += step
            cells = cells_from_tokens(tokens)
            html_output.multirow_print(cells)
        html_output.write_paragraph("----------")
Example No. 14
    def work_inner(self, input_file_path, output_path):

        itr = load_record(input_file_path)
        writer = RecordWriterWrap(output_path)

        def reform_a_input(raw_input):
            return np.reshape(raw_input, [self.inner_batch_size, self.max_seq_length])

        def reform_mask_input(raw_input):
            return np.reshape(raw_input, [self.inner_batch_size, self.max_predictions_per_seq])

        def get_as_list(feature, name):
            ids = list(feature[name].int64_list.value)
            ids_list = reform_a_input(ids)
            return ids_list

        all_features = []
        for feature in itr:
            listed_inputs = {}
            for key in ["input_ids", "input_mask", "segment_ids"]:
                listed_inputs[key] = get_as_list(feature, key)

            for key in ["masked_lm_positions","masked_lm_ids"]:
                ids = list(feature[key].int64_list.value)
                listed_inputs[key] = reform_mask_input(ids)

            listed_inputs["masked_lm_weights"] = reform_mask_input(feature["masked_lm_weights"].float_list.value)

            for i in range(self.inner_batch_size):
                new_features = collections.OrderedDict()
                for key, value in listed_inputs.items():
                    if key == "masked_lm_weights":
                        new_features[key] = create_float_feature(value[i])
                    else:
                        new_features[key] = create_int_feature(value[i])
                all_features.append(new_features)

        random.shuffle(all_features)
        for f in all_features:
            writer.write_feature(f)
        writer.close()
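Example No. 14 additionally uses create_float_feature for the masked_lm_weights column; by analogy with create_int_feature it is presumably the float variant (a sketch under that assumption):

import tensorflow as tf

def create_float_feature(values):
    # Float counterpart of create_int_feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))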
Example No. 15
def sample_median():
    # we don't want to make one of (bad/good) split to have shorter text than the other.
    all_scores = []
    scorer = get_lm_scorer()

    files = get_dir_files(tf_record_dir)
    random.shuffle(files)

    for file_path in files[:10]:
        tfrecord_itr = load_record(file_path)
        ticker = TimeEstimator(1000)
        for idx, inst in enumerate(tfrecord_itr):
            all_scores.append(scorer(inst))
            if idx > 1000:
                break
            ticker.tick()
    all_scores.sort()
    l = len(all_scores)
    print(l)
    mid = int(l / 2)
    print(all_scores[mid])
Example No. 16
def get_iterator():
    return load_record(
        os.path.join(data_path, "nli", "bert_code_train.tf_record"))
Example No. 17
def translate(tfrecord_path, st, ed):
    max_seq_length = 512
    transform_fn = partial(transform, max_seq_length)
    itr = slice_iterator(load_record(tfrecord_path), st, ed)
    for entry in itr:
        yield transform_fn(entry)
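slice_iterator presumably yields only the records whose index falls in [st, ed); itertools.islice provides exactly that behavior (a sketch under that assumption):

from itertools import islice

def slice_iterator(itr, st, ed):
    # Skip the first st items and stop before index ed.
    return islice(itr, st, ed)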
Example No. 18
def load_tfrecord(record_path):
    for feature in load_record(record_path):
        input_ids = feature["input_ids"].int64_list.value
        label_ids = feature["label_ids"].int64_list.value[0]
        yield input_ids, label_ids
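A hypothetical caller could materialize this generator into plain Python lists, e.g. for a scikit-learn style classifier (the path and variable names here are illustrative):

inputs, labels = [], []
for input_ids, label_id in load_tfrecord("dev.tf_record"):
    inputs.append(list(input_ids))
    labels.append(label_id)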
Example No. 19
    def create_instances(self, input_path, target_topic, target_seq_length):
        tokenizer = get_tokenizer()
        doc_top_k = 1000

        all_train_data = list(load_record(input_path))
        train_data = []
        for feature in all_train_data:
            input_ids = feature["input_ids"].int64_list.value
            token_id = input_ids[1]
            topic = token_ids_to_topic[token_id]
            if target_topic == topic:
                train_data.append(feature)

        print("Selected {} from {}".format(len(train_data), len(all_train_data)))

        doc_dict = load_tokens_for_topic(target_topic)
        token_doc_list = []
        ranked_list = sydney_get_ukp_ranked_list()[target_topic]
        print("Ranked list contains {} docs, selecting top-{}".format(len(ranked_list), doc_top_k))
        doc_ids = [doc_id for doc_id, _, _ in ranked_list[:doc_top_k]]

        for doc_id in doc_ids:
            doc = doc_dict[doc_id]
            token_doc = pool_tokens(doc, target_seq_length)
            token_doc_list.extend(token_doc)

        ranker = Ranker()
        target_tf_list = lmap(ranker.get_terms, token_doc_list)

        ranker.init_df_from_tf_list(target_tf_list)

        inv_index = collections.defaultdict(list)
        for doc_idx, doc_tf in enumerate(target_tf_list):
            for term in doc_tf:
                if ranker.df[term] < ranker.N * 0.3:
                    inv_index[term].append(doc_idx)


        def get_candidate_from_inv_index(inv_index, terms):
            s = set()
            for t in terms:
                s.update(inv_index[t])
            return s

        source_tf_list = []
        selected_context = []
        for s_idx, feature in enumerate(train_data):
            input_ids = feature["input_ids"].int64_list.value
            topic_seg, sent = split_p_h_with_input_ids(input_ids, input_ids)
            source_tf = ranker.get_terms_from_ids(sent)
            source_tf_list.append(source_tf)
            ranked_list = []
            candidate_docs = get_candidate_from_inv_index(inv_index, source_tf.keys())
            for doc_idx in candidate_docs:
                target_tf = target_tf_list[doc_idx]
                score = ranker.bm25(source_tf, target_tf)
                ranked_list.append((doc_idx, score, target_tf))
            ranked_list.sort(key=lambda x: x[1], reverse=True)
            ranked_list = list(filter_overlap(ranked_list))
            ranked_list = ranked_list[:self.max_context]

            if s_idx < 10:
                print("--- Source sentence : \n", pretty_tokens(tokenizer.convert_ids_to_tokens(sent), True))
                print("-------------------")
                for rank, (idx, score, target_tf) in enumerate(ranked_list):
                    ranker.bm25(source_tf, target_tf, True)
                    print("Rank#{}  {} : ".format(rank, score) + pretty_tokens(token_doc_list[idx], True))
            if s_idx % 100 == 0:
                print(s_idx)
            contexts = list([token_doc_list[idx] for idx, score, _ in ranked_list])
            selected_context.append(contexts)

        for sent_idx, feature in enumerate(train_data):
            contexts = selected_context[sent_idx]
            yield feature, contexts
Example No. 20
def run(dir_path, save_dir):
    for idx, topic in enumerate(
            data_generator.argmining.ukp_header.all_topics):
        file_path = os.path.join(dir_path, topic)
        save_path = os.path.join(save_dir, topic)
        augment_topic_ids(load_record(file_path), idx, save_path)
Example No. 21
def show_feature_text(tfrecord_path, output_file_name):
    html = HtmlVisualizer(output_file_name)
    tokenizer = get_tokenizer()

    for feature in load_record(tfrecord_path):
        write_feature_to_html(feature, html, tokenizer)
Example No. 22
def itr():
    for file in get_dir_files(path):
        for item in load_record(file):
            yield item