Example No. 1
def format_paragraph_features(
        tokenizer: FullTokenizer, max_seq_length: int,
        para_feature: ParagraphFeature) -> List[OrderedDict]:
    text1 = para_feature.datapoint.text1
    tokens1 = tokenizer.tokenize(text1)
    text2 = para_feature.datapoint.text2
    tokens2 = tokenizer.tokenize(text2)
    label: int = int(para_feature.datapoint.label)

    def encode(score_paragraph: ScoreParagraph) -> OrderedDict:
        para_tokens: List[Subword] = score_paragraph.paragraph.subword_tokens

        tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"
                                                  ] + para_tokens + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 1) + [1] * (
            len(tokens2) + 1) + [2] * (len(para_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([label])
        return features

    features: List[OrderedDict] = lmap(encode, para_feature.feature)
    return features
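The encode helper above packs three text segments into one BERT-style input: tokens1, tokens2, and the paragraph subwords, each terminated by "[SEP]", with segment ids 0, 1, and 2, and both lists truncated to max_seq_length. A minimal sketch of just that construction, leaving out the project-specific get_basic_input_feature and create_int_feature helpers (the function name build_three_segment_input is hypothetical):

def build_three_segment_input(tokens1, tokens2, para_tokens, max_seq_length):
    # Concatenate the three segments, each terminated by [SEP].
    tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"] + para_tokens + ["[SEP]"]
    # Segment ids record which of the three inputs each token came from.
    segment_ids = ([0] * (len(tokens1) + 1)
                   + [1] * (len(tokens2) + 1)
                   + [2] * (len(para_tokens) + 1))
    # Truncate both lists to the model's maximum sequence length, as in encode() above.
    return tokens[:max_seq_length], segment_ids[:max_seq_length]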
Example No. 2
def get_cpids_and_token_keys(
        tokenizer: FullTokenizer,
        claim_entry: ParagraphClaimPersFeature) -> Tuple[str, CPID]:
    claim_text = claim_entry.claim_pers.claim_text
    claim_tokens = tokenizer.tokenize(claim_text)
    p_text = claim_entry.claim_pers.p_text
    p_tokens = tokenizer.tokenize(p_text)
    key = " ".join(claim_tokens) + "_" + " ".join(p_tokens)
    cpid: CPID = CPID("{}_{}".format(claim_entry.claim_pers.cid,
                                     claim_entry.claim_pers.pid))
    return key, cpid
Example No. 3
def get_cpids_and_token_keys(
        tokenizer: FullTokenizer,
        para_feature: ParagraphFeature) -> Tuple[str, DPID]:
    text1 = para_feature.datapoint.text1
    tokens1 = tokenizer.tokenize(text1)
    text2 = para_feature.datapoint.text2
    tokens2 = tokenizer.tokenize(text2)

    key = " ".join(tokens1[1:]) + "_" + " ".join(tokens2)
    dpid: DPID = para_feature.datapoint.id
    return key, dpid
Example No. 4
def to_retrieval_format(tokenizer: FullTokenizer,
                        max_seq_length: int,
                        data_id_gen: DataIDGen,
                        f: ParagraphClaimPersFeature,
                        ) -> Tuple[Dict, List[OrderedDict]]:

    info_list = {}

    def get_feature(tokens1, tokens2, info):
        data_id = data_id_gen.new_id()
        info_list[data_id] = info
        tokens = tokens1 + tokens2
        segment_ids = [0] * len(tokens1) + [1] * len(tokens2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer,
                                           max_seq_length,
                                           tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([0])
        features['data_id'] = create_int_feature([data_id])
        return features

    ordered_dict_list = []
    for scored_paragraph in f.feature:
        tokens2 = scored_paragraph.paragraph.subword_tokens
        claim_tokens = tokenizer.tokenize(f.claim_pers.claim_text)
        p_tokens = tokenizer.tokenize(f.claim_pers.p_text)
        data_info_c = {
            'cid': f.claim_pers.cid,
        }
        out_f = get_feature(claim_tokens, tokens2, data_info_c)
        ordered_dict_list.append(out_f)

        data_info_p = {
            'pid': f.claim_pers.pid
        }
        out_f = get_feature(p_tokens, tokens2, data_info_p)
        ordered_dict_list.append(out_f)

    return info_list, ordered_dict_list
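Here each feature is tagged with a fresh id from data_id_gen.new_id(), and info_list maps that id back to the claim (cid) or perspective (pid) it encodes, so model outputs can later be joined with their metadata. The DataIDGen implementation is not shown in these examples; a counter-based stand-in such as the following would satisfy the interface used above (the class name SimpleDataIDGen is hypothetical):

class SimpleDataIDGen:
    """Hypothetical stand-in for DataIDGen: hands out consecutive integer ids."""

    def __init__(self, start: int = 0):
        self._next_id = start

    def new_id(self) -> int:
        data_id = self._next_id
        self._next_id += 1
        return data_id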
Example No. 5
def get_biobert_tokenizer():
    return FullTokenizer(get_biobert_voca_path())
Example No. 6
    def __init__(self, out_path):
        self.out_dir = "/mnt/nfs/work3/youngwookim/data/clueweb12-B13_tokens"
        voca_path = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = FullTokenizer(voca_path, True)

        self.file_list = load_undone_file_list()
Example No. 7
import os
import sys

import datastore.tool
from cpath import data_path
from data_generator.tokenizer_wo_tf import FullTokenizer
from galagos.doc_processor import process_jsonl


def all_pipeline(jsonl_path, tokenize_fn):
    # Read the jsonl file line by line, tokenize each document, and save the results.
    buffered_saver = datastore.tool.BufferedSaver()
    with open(jsonl_path, "r") as line_itr:
        process_jsonl(line_itr, tokenize_fn, buffered_saver)
    buffered_saver.flush()


if __name__ == "__main__":
    jsonl_path = sys.argv[1]
    if len(sys.argv) == 3:
        voca_path = sys.argv[2]
    else:
        voca_path = os.path.join(data_path, "bert_voca.txt")

    tokenize_fn = FullTokenizer(voca_path, True).tokenize
    all_pipeline(jsonl_path, tokenize_fn)
Example No. 8
    def __init__(self, out_path_not_used):
        voca_path = os.path.join(data_path, "bert_voca.txt")
        self.tokenize_fn = FullTokenizer(voca_path, True).tokenize
        self.jsonl_path_format = "/mnt/nfs/work3/youngwookim/data/perspective/train_claim_perspective/doc_jsonl/{}.jsonl"
Example No. 9
    def __init__(self, jsonl_path, out_dir):
        voca_path = os.path.join(data_path, "bert_voca.txt")
        self.tokenize_fn = FullTokenizer(voca_path, True).tokenize
        self.jsonl_path = jsonl_path
        self.out_dir = out_dir
        exist_or_mkdir(out_dir)
Example No. 10
    def __init__(self, jsonl_path, out_path_not_used):
        voca_path = os.path.join(data_path, "bert_voca.txt")
        self.tokenize_fn = FullTokenizer(voca_path, True).tokenize
        self.jsonl_path = jsonl_path
Example No. 11
def get_tokenizer():
    voca_path = os.path.join(data_path, "bert_voca.txt")
    return FullTokenizer(voca_path)
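Across these examples the pattern is the same: FullTokenizer is built from a vocabulary file (optionally with a boolean flag that, as in the reference BERT tokenizer, presumably controls lower-casing) and then used through its tokenize method, either directly or as a bound tokenize_fn callback. A minimal usage sketch under those assumptions, with a placeholder vocabulary path:

from data_generator.tokenizer_wo_tf import FullTokenizer

vocab_path = "bert_voca.txt"  # placeholder: path to a BERT WordPiece vocabulary file
tokenizer = FullTokenizer(vocab_path, True)  # boolean flag assumed to be do_lower_case
subword_tokens = tokenizer.tokenize("Example sentence to split into subword tokens.")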