Example #1
 def initializer(self):
     global bpe
     bpe = get_encoder(
         os.path.join(self.roberta_dir, 'gpt2_bpe', 'encoder.json'),
         os.path.join(self.roberta_dir, 'gpt2_bpe', 'vocab.bpe'),
     )
     global vocab
     vocab = Dictionary.load(os.path.join(self.roberta_dir, 'roberta.base', 'dict.txt'))
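This initializer pattern is usually paired with a per-line encoding helper run in the multiprocessing workers. A minimal sketch of such a helper, assuming the globals set above (the encode_line name and the exact call pattern are assumptions, not taken from the original source):

def encode_line(line):
    # BPE-encode the raw text into GPT-2 ids, then map those ids onto fairseq
    # Dictionary indices (the dictionary stores the BPE ids as string symbols).
    global bpe, vocab
    bpe_ids = bpe.encode(line)
    return vocab.encode_line(
        " ".join(map(str, bpe_ids)),
        append_eos=True,
        add_if_not_exist=False,
    )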
Example #2
 def initializer(self):
     global bpe
     bpe = get_encoder(
         os.path.join(self.roberta_dir, 'encoder.json'),
         os.path.join(self.roberta_dir, 'vocab.bpe'),
     )
     global vocab
     vocab = Dictionary.load(os.path.join(self.roberta_dir, 'dict.txt'))
     global entities
     if self.entity_vocab is not None:
         entities = load_entities(self.entity_vocab)
Example #3
def create_ent_augmented_target(source_file,
                                target_file,
                                out_text_file,
                                out_bpe_file,
                                tokenizer_dir,
                                special_token=50009,
                                max_len=1024):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)

    nlp = spacy.load("en_core_web_lg")

    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)

    with open(source_file, 'r') as s_f, \
        open(target_file, 'r') as t_f, \
        open(out_bpe_file, 'w') as out_bpe_f, \
        open(out_text_file, 'w') as out_text_f:

        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()

            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    # if e.text in source:
                    match_result = entity_match(e.text, sline, 2)
                    if match_result:
                        entities_per_example.append(match_result[0])
            target_bpe = bpe.encode(tline)
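            # Prepend the entities matched in the source, followed by the
            # special separator token (50009 by default), to the BPE target.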
            if entities_per_example:
                entity_bpe = bpe.encode(", ".join(entities_per_example))
                augmented_target_bpe = entity_bpe + [
                    special_token,
                ] + target_bpe
            else:
                augmented_target_bpe = [
                    special_token,
                ] + target_bpe
            out_text_f.write("{}".format(entities_per_example) + '\n')
            out_bpe_f.write(
                ' '.join(map(str, augmented_target_bpe[:max_len - 1])) + '\n')
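A hypothetical invocation of the function above; every path here is a placeholder, and tokenizer_dir is assumed to hold encoder.json and vocab.bpe as the code expects:

create_ent_augmented_target(
    source_file="data/train.source",         # placeholder paths
    target_file="data/train.target",
    out_text_file="data/train.entities.txt",
    out_bpe_file="data/train.bpe.target",
    tokenizer_dir="gpt2_bpe",
)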
Example #4
def create_ent_labels(source_file,
                      target_file,
                      out_file,
                      tokenizer_dir,
                      first_only=False):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)

    nlp = spacy.load("en_core_web_lg")
    entities_found = []

    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)

    with open(source_file, 'r') as s_f, \
        open(target_file, 'r') as t_f, \
        open(out_file, 'w') as out_f:

        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            tokens = bpe.encode(sline)
            labels = [0] * len(tokens)
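            # one label per source BPE token; 0 presumably marks tokens outside
            # any tracked entity in the BIO scheme used by update_bio_labels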

            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    entity_new = {'text': e.text, 'type': e[0].ent_type_}
                    # if e.text in source:
                    match_result = entity_match(e.text, sline, 2)
                    entity_new['match_result'] = match_result
                    labels = update_bio_labels(labels,
                                               sline,
                                               match_result,
                                               tokens,
                                               bpe,
                                               first_only=first_only)
                    entities_per_example.append(entity_new)
            out_f.write(" ".join([str(i) for i in labels]) + '\n')
            entities_found.append(entities_per_example)
    return entities_found
Example #5
def sanity_check(entities, source_bpe_file, label_file, eval_file,
                 tokenizer_dir):
    n_s = count_lines_in_text_file(source_bpe_file)
    n_l = count_lines_in_text_file(label_file)

    assert n_s == n_l == len(entities), \
        "Number of lines not consistent: {}, {}, {}, {}".format(n_s, n_l, len(entities))
    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)

    with open(source_bpe_file, 'r') as s_f, \
        open(label_file, 'r') as l_f, \
        open(eval_file, 'w') as o_f:
        for i in tqdm(range(n_l)):
            sline = s_f.readline().strip()
            tokens = [int(t) for t in sline.split()]
            lline = l_f.readline().strip()
            labels = [int(t) for t in lline.split()]
            assert len(tokens) == len(
                labels), "Number of source tokens must equal that of labels!"
            entities_per_example = entities[i]
            ent_text = ""
            for e in entities_per_example:
                ent_text += e['text']
                ent_text += str(e['match_result'])
                ent_text += ", "
            spans = extract_ent_from_labels(tokens, labels)
            ent_text += "FROM LABELS==>"
            for span in spans:
                ent_text += bpe.decode(span).strip()
                ent_text += ', '
            ent_text += '\n'
            o_f.write(ent_text)
Example #6
 def initializer(self):
     global bpe
     bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)
Example #7
def main(json_folder_dir, debug_mode, dataset_name, percentage,
         encoder_json_file_path, vocab_bpe_path):

    if dataset_name == "All":
        dataset_name_list = [
            'biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset',
            'pmc_custom_license'
        ]
    else:
        dataset_name_list = [dataset_name]

    ## take text out of json
    txt_dir = "raw_txt_data"
    json_text(json_folder_dir, dataset_name_list, percentage, txt_dir)

    ## BPE on text
    ## Encoder file paths are passed in as arguments, e.g.:
    # encoder_json_file_path = 'gpt2_bpe/encoder.json'
    # vocab_bpe_path = 'gpt2_bpe/vocab.bpe'
    bpe = get_encoder(encoder_json_file_path, vocab_bpe_path)

    def encode(line):
        # global bpe
        ids = bpe.encode(line)
        return list(map(str, ids))

    def decode(tokens):
        # global bpe
        return bpe.decode(tokens)

    bpe_dir = 'bpe_data'
    os.makedirs(bpe_dir, exist_ok=True)
    input_file_train = open(os.path.join(txt_dir, "train.txt"),
                            "r",
                            encoding="utf-8")
    input_file_val = open(os.path.join(txt_dir, "val.txt"),
                          "r",
                          encoding="utf-8")

    output_file_train = open(os.path.join(bpe_dir, "train.bpe"),
                             "w",
                             encoding="utf-8")
    output_file_val = open(os.path.join(bpe_dir, "val.bpe"),
                           "w",
                           encoding="utf-8")

    input_file_list = [input_file_train, input_file_val]
    output_file_list = [output_file_train, output_file_val]

    for handler_idx, output_handler in enumerate(output_file_list):
        for line in input_file_list[handler_idx]:
            line = line.strip()
            tokens = encode(line)
            if len(tokens) == 0:
                continue
            # one space-separated line of BPE token ids per non-empty input line
            print(" ".join(tokens), file=output_handler)
    # close the handles so the encoded output is flushed to disk
    for f in input_file_list + output_file_list:
        f.close()
    print("BPE has been generated.")
Example #8
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt" if "unigram"
                               not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize=True if "jpe" not in self.config.n_model else False,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)
Example #9
 def __init__(self, encoder_json_path: str, vocab_bpe_path: str):
     self.processor = get_encoder(encoder_json_path, vocab_bpe_path)
Example #10
from fairseq.data.encoders.gpt2_bpe import get_encoder
encoder = get_encoder("/path/to/roberta.base/encoder.json",
                      "/path/to/roberta.base/vocab.bpe")
train_data = []
with open("dev.csv", "r", encoding="utf-8") as f:
    lines = f.readlines()
    tmp_sentence = []
    for idx, line in enumerate(lines):
        if idx == 0:
            continue
        if line.split(",")[0].startswith("Sentence: "):
            if len(tmp_sentence) > 0:
                train_data.append(tmp_sentence)
            tmp_sentence = []
        line = line.strip().split(",")
        if len(line) == 4:
            word = line[1]
            label = line[3]
        else:
            word = ","
            label = line[-1]
        # In NER, capitalized words are more likely to be predicted as entities,
        # so encoder.encode(word.lower()) may help on real-world text, although
        # accuracy on this dataset may be lower.
        ids = encoder.encode(word)
        for idx, _id in enumerate(ids):
            if label.startswith("B") and idx != 0:
                label = "I" + label[1:]
            tmp_sentence.append((str(_id), label))

with open("dev.text.txt.bpe", "w", encoding="utf-8") as ft, \
    open("dev.label.txt", "w", encoding="utf-8") as fl:
Example #11
def preprecess_QA_generation_newsqa_squad(
        input_dir,
        output_dir,
        encoder_json="/home/ec2-user/fairseq/encoder.json",
        vocab_bpe="/home/ec2-user/fairseq/vocab.bpe",
        only_squad=False):
    # use '50009' for the special dictionary token to separate question and answers since
    # this token is not encountered in bpe outputs

    def _process_data(d, data_source, bpe, source_f, source_bpe_f, target_f,
                      target_bpe_f):
        if data_source == 'newsqa':
            source = d['text'].strip()
            for q in d['questions']:
                if 'consensus' in q and 'q' in q and 's' in q['consensus']:
                    question = q['q'].strip()
                    answer_s = q['consensus']['s']
                    answer_e = q['consensus']['e']
                    answer = source[answer_s:answer_e].strip()
                    truncated_source_bpe, truncated_source, question_answer_bpe = \
                        _format_question_answers_bpe(bpe, source, question, answer, special_token_id)

                    if truncated_source is None or answer_e >= len(truncated_source):
                        # skip the question: the answer span was truncated in the source
                        continue
                    source_f.write(
                        truncated_source.encode(
                            'unicode-escape').decode().replace('\\\\', '\\') +
                        '\n')
                    source_bpe_f.write(
                        ' '.join(map(str, truncated_source_bpe)) + '\n')
                    target_f.write(bpe.decode(question_answer_bpe) + '\n')
                    target_bpe_f.write(
                        ' '.join(map(str, question_answer_bpe)) + '\n')
        elif data_source == 'squad':
            for paragraph in d['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question'].strip()
                    ans_set = set()
                    for ans in qa['answers']:
                        if ans['text'] not in ans_set:
                            ans_set.add(ans['text'])
                            truncated_source_bpe, truncated_source, question_answer_bpe = \
                                _format_question_answers_bpe(bpe, context, question, ans['text'], special_token_id)

                            if truncated_source is None:  # skip the question
                                continue
                            source_f.write(
                                truncated_source.encode('unicode-escape').
                                decode().replace('\\\\', '\\') + '\n')
                            source_bpe_f.write(
                                ' '.join(map(str, truncated_source_bpe)) +
                                '\n')
                            target_f.write(
                                bpe.decode(question_answer_bpe) + '\n')
                            target_bpe_f.write(
                                ' '.join(map(str, question_answer_bpe)) + '\n')
        else:
            raise Exception("data_source must be squad or newsqa!")

    special_token_id = 50009
    from fairseq.data.encoders.gpt2_bpe import get_encoder
    bpe = get_encoder(encoder_json, vocab_bpe)
    if not only_squad:
        input_json = os.path.join(input_dir, 'combined-newsqa-data-v1.json')
        with open(input_json, 'r') as f:
            newsqa = json.load(f)

    with open(os.path.join(output_dir, 'train.source'), 'w') as train_source_f, \
            open(os.path.join(output_dir, 'train.target'), 'w') as train_target_f, \
            open(os.path.join(output_dir, 'train.bpe.source'), 'w') as train_source_bpe_f, \
            open(os.path.join(output_dir, 'train.bpe.target'), 'w') as train_target_bpe_f, \
            open(os.path.join(output_dir, 'val.source'), 'w') as val_source_f, \
            open(os.path.join(output_dir, 'val.target'), 'w') as val_target_f, \
            open(os.path.join(output_dir, 'val.bpe.source'), 'w') as val_source_bpe_f, \
            open(os.path.join(output_dir, 'val.bpe.target'), 'w') as val_target_bpe_f, \
            open(os.path.join(output_dir, 'test.source'), 'w') as test_source_f, \
            open(os.path.join(output_dir, 'test.target'), 'w') as test_target_f, \
            open(os.path.join(output_dir, 'test.bpe.source'), 'w') as test_source_bpe_f, \
            open(os.path.join(output_dir, 'test.bpe.target'), 'w') as test_target_bpe_f:

        if not only_squad:
            for data in tqdm(newsqa['data']):
                if data['type'] == 'train':
                    _process_data(data, 'newsqa', bpe, train_source_f,
                                  train_source_bpe_f, train_target_f,
                                  train_target_bpe_f)
                elif data['type'] == 'dev':
                    _process_data(data, 'newsqa', bpe, val_source_f,
                                  val_source_bpe_f, val_target_f,
                                  val_target_bpe_f)
                elif data['type'] == 'test':
                    _process_data(data, 'newsqa', bpe, test_source_f,
                                  test_source_bpe_f, test_target_f,
                                  test_target_bpe_f)
                else:
                    print("data type error!")
                    print(data)
                    break

            print("Done with NewsQA!")

        print("Doing Squad now!")
        data_types = ["validation", "train"]
        for dtype in data_types:
            if dtype == "validation":
                input_file = "dev-v1.1.json"
            elif dtype == "train":
                input_file = "train-v1.1.json"
            else:
                print("ERROR! data split should be validation or train!")

            with open(os.path.join(input_dir, input_file), 'r') as f_in:
                data_dict = json.load(f_in)
            if dtype == "train":
                for data in tqdm(data_dict['data']):
                    _process_data(data, 'squad', bpe, train_source_f,
                                  train_source_bpe_f, train_target_f,
                                  train_target_bpe_f)
            elif dtype == "validation":
                for data in data_dict['data']:
                    _process_data(data, 'squad', bpe, val_source_f,
                                  val_source_bpe_f, val_target_f,
                                  val_target_bpe_f)
Example #12
 def __init__(self, dictionary, entity_dictionary, task=None):
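     # encoder_json and vocab_bpe are presumably class-level attributes
     # defined elsewhere; they are not passed into this constructor.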
     self.bpe = get_encoder(self.encoder_json, self.vocab_bpe)
     self.dictionary = dictionary
     self.entity_dictionary = entity_dictionary
     self.task = task