Example #1
def main():
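    # Compute document frequency (df, keyed by words as subtoken tuples) and
    # the average document length over all TFRecord files in the given directory.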
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()

    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")
Example #2
def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    tokenizer = get_tokenizer()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in queries:
        query = queries[query_id]
        query_tokens = tokenizer.tokenize(query)

        ranked_list = galago_rank[query_id]
        ranked_list = ranked_list[:100]
        doc_ids = list([e.doc_id for e in ranked_list])

        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens,
                                                       max_seq_length)):
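                # always keep the first passage of a document; sample later passages at 10%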
                if idx == 0:
                    include = True
                else:
                    include = random.random() < 0.1

                if include:
                    c = QCKCandidateWToken(doc_id, "", passage)
                    candidate.append(c)

        out_d[query_id] = candidate
    return out_d
Example #3
File: tf_debug.py Project: clover3/Chair
def main():
    tokens = ["hi", "hello"]
    seg_ids = [
        0,
        0,
    ]
    inst = ClassificationInstance(tokens, seg_ids, 0)

    inst_list = [inst]

    out_path = "/tmp/temp.youngwoo"
    max_seq_length = 512
    tokenizer = get_tokenizer()
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    def encode_fn(inst: ClassificationInstance) -> OrderedDict:
        return encode_classification_instance(tokenizer, max_seq_length, inst)

    features_list: Iterable[OrderedDict] = map(encode_fn, inst_list)
    writer = tf.python_io.TFRecordWriter(out_path)
    for e in features_list:
        # features = OrderedDict()
        # features["input_ids"] = create_int_feature(input_ids)

        f = tf.train.Features(feature=e)
        tf_example = tf.train.Example(features=f)
        writer.write(tf_example.SerializeToString())
    writer.close()  # flush the written records to disk
Example #4
def main():
    info_path = os.path.join(job_man_dir, "MMD_pred_info", "1.info")
    info = load_combine_info_jsons(info_path)
    tokenizer = get_tokenizer()
    cnt = 0
    fn = os.path.join(job_man_dir, "MMD_pred", "1")

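    # Print the first 10 records, showing the text before and after the first [SEP].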
    for record in tf.compat.v1.python_io.tf_record_iterator(fn):
        example = tf.train.Example()
        example.ParseFromString(record)
        feature = example.features.feature
        keys = feature.keys()

        print("---- record -----")
        v = feature["input_ids"].int64_list.value
        data_id = feature["data_id"].int64_list.value[0]
        info_entry = info[str(data_id)]
        passage_idx = info_entry['passage_idx']
        tokens = tokenizer.convert_ids_to_tokens(v)
        text = " ".join(tokens)
        sep_idx = text.find("[SEP]")
        print(passage_idx)
        print(text[:sep_idx])
        print(text[sep_idx:])

        cnt += 1
        if cnt >= 10:
            break
Example #5
def write_records(records: List[Record], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        tokens = ["[CLS]"] + record.claim_tokens + [
            "[SEP]"
        ] + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(record.claim_tokens) + 2) \
                      + [1] * (len(record.doc_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)

        labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
        labels += (max_seq_length - len(labels)) * [0.]
        label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
        label_mask += (max_seq_length - len(label_mask)) * [0]
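        # per-token float scores for the doc tokens; [CLS], claim tokens, and the
        # first [SEP] get 0, and both lists are zero-padded to max_seq_length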
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
Example #6
def make_training_data(config):
    pos_doc_list_path = config['doc_list_path']
    q_res_path = config['q_res_path']
    save_path = config['save_path']
    balance_test = config['balance_test']

    max_seq_length = 512

    pos_doc_ids = set(
        [l.strip() for l in open(pos_doc_list_path, "r").readlines()])
    doc_ids_unique = get_doc_ids_from_ranked_list_path(q_res_path)

    insts = generate(list(pos_doc_ids), list(doc_ids_unique), max_seq_length)

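    # 90/10 train/validation split; optionally cap validation negatives at the number of positives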
    train_size = int(0.9 * len(insts))
    train_insts = insts[:train_size]
    val_insts = insts[train_size:]

    val_pos_insts = list([i for i in val_insts if i.label == 1])
    val_neg_insts = list([i for i in val_insts if not i.label])
    print("num pos inst in val", len(val_pos_insts))
    if balance_test:
        val_neg_insts = val_neg_insts[:len(val_pos_insts)]
    val_insts = val_pos_insts + val_neg_insts

    tokenizer = get_tokenizer()

    def encode_fn(inst: Instance) -> OrderedDict:
        return encode_w_data_id(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(save_path + "train", encode_fn, train_insts)
    write_records_w_encode_fn(save_path + "val", encode_fn, val_insts)
Example #7
def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()

    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
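            # keep roughly every third record (those whose index is a multiple of 3)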
            if idx % 3:
                continue
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_mask = feature["input_mask"].int64_list.value
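            # only consider sequences that fill the whole input (no trailing padding)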
            if input_mask[-1]:
                input_ids = feature["input_ids"].int64_list.value
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                sep_idx1 = tokens.index("[SEP]")
                sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
                doc_tokens = tokens[sep_idx1:sep_idx2]
                continue_cnt = 0
                for t in doc_tokens:
                    if t[:2] == "##":
                        continue_cnt += 1
                n_words = len(doc_tokens) - continue_cnt
                averager.append(n_words)

    print("average", averager.get_average())
Example #8
def modify_data_loader(data_loader):
    tokenizer = get_tokenizer()
    CLS_ID = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
    SEP_ID = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    data_loader.CLS_ID = CLS_ID
    data_loader.SEP_ID = SEP_ID
    return data_loader
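
A minimal usage sketch for the helper above; the data_loader object here is hypothetical (any loader exposing CLS_ID and SEP_ID attributes would do):

    loader = modify_data_loader(my_data_loader)
    print(loader.CLS_ID, loader.SEP_ID)  # ids of "[CLS]" and "[SEP]" in the loaded vocabulary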
Example #9
def file_show(fn):
    cnt = 0
    tokenizer = get_tokenizer()
    for record in tf.compat.v1.python_io.tf_record_iterator(fn):
        example = tf.train.Example()
        example.ParseFromString(record)
        feature = example.features.feature
        keys = feature.keys()

        print("---- record -----")
        for key in keys:
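            # masked_lm_weights is stored as a float list; all other features are int64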
            if key == "masked_lm_weights":
                v = feature[key].float_list.value
            else:
                v = feature[key].int64_list.value

            print(key)
            print(v)

            if key in ["input_ids", "input_ids1", "input_ids2"]:
                tokens = tokenizer.convert_ids_to_tokens(v)
                print(key)
                print(" ".join(tokens))

        cnt += 1
        if cnt >= 5:
            break
Example #10
    def __init__(self, topic, ranked_list_path, token_file_path):
        ranked_list_d = load_galago_ranked_list(ranked_list_path)
        self.ranked_list = ranked_list_d["unk-0"]

        self.tokenizer = get_tokenizer()
        self.topic = topic
        self.tokens = pickle.load(open(token_file_path, "rb"))
        self.doc_idx = 0
Example #11
def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(inst: Payload) -> OrderedDict:
        inst_2 = convert_sub_token(tokenizer, inst)
        return encode_inner(max_seq_length, tokenizer, inst_2)

    write_records_w_encode_fn(output_path, encode, records)
Example #12
def get_recover_subtokens():
    tokenizer = get_tokenizer()

    def recover_subtokens(input_ids) -> List[str]:
        tokens1, tokens2 = split_p_h_with_input_ids(input_ids, input_ids)
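        # tokens2 presumably corresponds to the second (hypothesis) segment of the pair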
        return tokenizer.convert_ids_to_tokens(tokens2)

    return recover_subtokens
Example #13
 def __init__(self):
     self.continuation = set()
     tokenizer = get_tokenizer()
     assert tokenizer is not None
     self.inv_vocab = tokenizer.inv_vocab
     for token_id, subword in tokenizer.inv_vocab.items():
         if subword[:2] == "##":
             self.continuation.add(token_id)
Example #14
 def __init__(self, bm25_module, max_seq_length, include_title=False):
     self.max_seq_length = max_seq_length
     self.bm25_module = bm25_module
     pc_tokenize = PCTokenizer()
     self.tokenize_stem = pc_tokenize.tokenize_stem
     self.include_title = include_title
     bert_tokenizer = get_tokenizer()
     self.bert_tokenize = bert_tokenizer.tokenize
Example #15
def get_continuation_token_ids() -> Set[int]:
    tokenizer = get_tokenizer()

    s = set()
    for token, token_id in tokenizer.vocab.items():
        if token[:2] == "##":
            s.add(token_id)
    return s
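
A small sketch of one way to use this set, e.g. counting whole words in a WordPiece id sequence (input_ids is assumed to be a list of token ids as in the other examples):

    continuation_ids = get_continuation_token_ids()
    n_words = sum(1 for token_id in input_ids if token_id not in continuation_ids)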
Example #16
 def __init__(self,
              candidates_dict: Dict[str, List[QCKCandidate]],
              is_correct_fn: Callable[[QCKQuery, QCKCandidate], bool],
              ):
     self.max_seq_length = 512
     self.tokenizer = get_tokenizer()
     self.candidates_dict: Dict[str, List[QCKCandidate]] = candidates_dict
     self._is_correct = is_correct_fn
Example #17
 def __init__(self, encoder, max_seq_length, top_k=100, query_type="title"):
     self.data = self.load_tokens_from_pickles()
     self.max_seq_length = max_seq_length
     self.queries = load_robust_04_query(query_type)
     self.galago_rank = load_bm25_best()
     self.top_k = top_k
     self.encoder = encoder
     self.tokenizer = get_tokenizer()
Example #18
 def __init__(self, resource: ProcessedResourceI, max_seq_length,
              max_seg_per_doc):
     self.resource = resource
     self.tokenizer = get_tokenizer()
     self.title_token_max = 64
     self.query_token_max = 64
     self.max_seq_length = max_seq_length
     self.max_seg_per_doc = max_seg_per_doc
Example #19
 def __init__(self, file_path, fetch_data_list=None):
     self.vectors, self.keys, self.data_len = self.estimator_prediction_loader(
         file_path, fetch_data_list)
     self.tokenizer = get_tokenizer()
     self.method_list = list([
         func for func in dir(EstimatorPredictionViewer)
         if callable(getattr(EstimatorPredictionViewer, func))
     ])
Example #20
 def __init__(
     self,
     cid_to_passages: Dict[int, List[Tuple[List[str], float]]],
     candidate_perspective: Dict[int, List[int]],
 ):
     self.gold = get_claim_perspective_id_dict()
     self.candidate_perspective = candidate_perspective
     self.cid_to_passages = cid_to_passages
     self.tokenizer = get_tokenizer()
Example #21
def translate_word_tf_to_subword_tf(word_tf):
    tokenizer = get_tokenizer()

    out = Counter()
    for word in word_tf:
        sub_words = tokenizer.tokenize(word)
        for sw in sub_words:
            out[sw] += word_tf[word]
    return out
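
A usage sketch, assuming Counter is collections.Counter as in the function body (the word counts below are made up, and the exact subword split depends on the loaded vocabulary):

    word_tf = Counter({"playing": 3, "chess": 1})
    subword_tf = translate_word_tf_to_subword_tf(word_tf)
    # each subword of "playing" receives a count of 3, each subword of "chess" a count of 1
    print(subword_tf.most_common())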
Example #22
    def __init__(self, split):
        query_group: List[List[QueryID]] = load_query_group(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        self.queires = dict(load_queries(split))
        self.query_group = query_group
        self.tokenizer = get_tokenizer()
        self.qrel = qrel
Example #23
def get_statistic_for_join(join_result: Iterable[Tuple[str, MSMarcoDoc, JoinedPassage]]):
    print("get_statistic_for_join()")
    tokenizer = get_tokenizer()

    def size_in_tokens(text):
        return len(tokenizer.tokenize(text))

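    # bin ceilings: 50-token steps up to 450, then 500-token steps up to 4500, plus a "5000 <" overflow bucket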
    intervals = list(range(0, 500, 50)) + list(range(500, 5000, 500))
    last = "5000 <"
    keys = intervals + [last]
    def bin_fn(n):
        for ceil in intervals:
            if n < ceil:
                return ceil

        return "5000 <"

    bin_doc = BinHistogram(bin_fn)
    bin_loc = BinHistogram(bin_fn)
    bin_passage = BinHistogram(bin_fn)

    match_fail = 0
    for _, doc, passage in join_result:  # the leading str in each tuple is not used here
        if passage.loc >= 0:
            prev = doc.body[:passage.loc]
            n_tokens_before = len(tokenizer.tokenize(prev))
            passage_text = passage.text
            passage_len = len(passage_text)
            # print("passage loc", passage_loc)
            # print(n_tokens_before)
            bin_doc.add(size_in_tokens(doc.body))
            bin_loc.add(size_in_tokens(prev))
            bin_passage.add(size_in_tokens(passage_text))

            # print(prev)
            # print("  >>>>>  ")
            # print(passage_maybe)
            # print("   <<<< ")
            # print(next)
            pass
        else:
            match_fail += 1
            # print("passage not found in doc")
            # print(doc.body)

    print('match fail', match_fail)
    print("doc length")
    bins = [bin_doc, bin_passage, bin_loc]
    head = ['', 'bin_doc', 'bin_passage', 'bin_loc']
    rows = [head]
    for key in keys:
        row = [key]
        for bin in bins:
            row.append(bin.counter[key])
        rows.append(row)

    print_table(rows)
Example #24
def select_word_from_dev():
    tokenizer = get_tokenizer()

    tf_dev = load_from_pickle("nli_tf_dev_mis")
    selected_words = select_common(tf_dev, tokenizer)

    print(list(tf_dev.most_common(100))[-1])

    save_to_pickle(selected_words, "nli_dev_selected_words")
Example #25
File: qcknc_mix.py Project: clover3/Chair
 def __init__(self,
              candidates_dict: Dict[str, List[QCKCandidateI]],
              is_correct_fn,
              kdp_as_sub_token=False):
     self.max_seq_length = 512
     self.tokenizer = get_tokenizer()
     self.candidates_dict: Dict[str, List[QCKCandidateI]] = candidates_dict
     self._is_correct = is_correct_fn
     self.kdp_as_sub_token = kdp_as_sub_token
Example #26
def baseline_bert_gen_unbal_resplit(outpath, split):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point_50_train_val(split)
    max_seq_length = 512

    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(tokenizer, max_seq_length, entry))
    writer.close()
Example #27
 def __init__(self, split, q_config_id, out_dir):
     self.out_dir = out_dir
     self.ci = StaticRankedListInterface(q_config_id)
     print("load__data_point")
     self.all_data_points = load_data_point(split)
     print("Load term stat")
     _, clue12_13_df = load_clueweb12_B13_termstat()
     self.clue12_13_df = clue12_13_df
     self.tokenizer = get_tokenizer()
Example #28
 def __init__(self, encoder, max_seq_length, query_type="title"):
     self.data = self.load_tokens()
     qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
     self.judgement = load_qrels_structured(qrel_path)
     self.max_seq_length = max_seq_length
     self.queries = load_robust_04_query(query_type)
     self.encoder = encoder
     self.tokenizer = get_tokenizer()
     self.galago_rank = load_bm25_best()
Example #29
 def __init__(self, encoder, max_seq_length):
     self.data = load_robust_tokens_for_train()
     assert len(self.data) == 174787
     qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
     self.judgement = load_qrels_structured(qrel_path)
     self.max_seq_length = max_seq_length
     self.queries = load_robust04_title_query()
     self.encoder = encoder
     self.tokenizer = get_tokenizer()
Example #30
File: tune.py Project: clover3/Chair
 def __init__(self, prcessed_resource: ProcessedResourceI,
              get_tokens_d_bert, get_tokens_d_bm25, parallel_encoder,
              max_seq_length):
     self.prcessed_resource = prcessed_resource
     self.get_tokens_d_bert = get_tokens_d_bert
     self.get_tokens_d_bm25 = get_tokens_d_bm25
     self.encoder = parallel_encoder
     self.tokenizer = get_tokenizer()
     self.max_seq_length = max_seq_length