Example No. 1
import os

from tokenization import BasicTokenizer  # BERT's tokenization module


def main(argv):
    input_dir = argv[1]
    output_dir = argv[2]
    tokenizer = BasicTokenizer()

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    for folder in os.listdir(input_dir):
        input_folder = os.path.join(input_dir, folder)
        output_folder = os.path.join(output_dir, folder)
        os.mkdir(output_folder)

        for text in os.listdir(input_folder):
            fin = open(os.path.join(input_folder, text), 'r', encoding='utf8')
            fout = open(os.path.join(output_folder, text),
                        'w',
                        encoding='utf8')
            for line in fin:
                if line.startswith('<doc') or line.startswith('</doc'):
                    continue

                line = line.strip()
                if not line:
                    continue

                for sentence in sentence_tokenize(line):
                    fout.write(" ".join(tokenizer.tokenize(sentence)) + '\n')

            fin.close()
            fout.close()
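The `sentence_tokenize` helper is not shown in this example; a minimal stand-in (an assumption, not the original implementation) could wrap NLTK's sentence splitter:

# Hypothetical stand-in for the undefined sentence_tokenize helper,
# assuming NLTK and its Punkt models are available.
from nltk.tokenize import sent_tokenize

def sentence_tokenize(line):
    # Split a raw line into sentences before word-level tokenization.
    return sent_tokenize(line)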
Example No. 2
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """
    Project the tokenized prediction back to the original text.
    """
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Example No. 3
def tokenize_by_char(file):
    """
    Split the text into individual characters.
    :param file: path to a txt file
    :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
    """
    basic_tokenizer = BasicTokenizer()
    token_list = []
    with open(file, mode="r", encoding="utf-8") as fin:
        text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for tok in str_list:
            token_list += list(tok)
    return token_list
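For reference, a small illustration of how the character-level split behaves (assuming BERT's tokenization.py is importable as `tokenization`):

# BasicTokenizer pads CJK ideographs with spaces, so kanji come back as single
# tokens while kana/Latin runs stay intact; list(token) then breaks those
# remaining runs into individual characters.
from tokenization import BasicTokenizer

tokens = BasicTokenizer().tokenize("乙女はお姉さまに恋してる")
chars = [c for tok in tokens for c in tok]
print(chars)  # ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る']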
Example No. 4
def s_split():
    files = _get_text_file()
    basic_tokenizer = BasicTokenizer(do_lower_case=args.do_lower_case)
    for file in files.split(","):
        with open(file + ".sent_splited",
                  'wt',
                  encoding='utf-8',
                  errors='ignore') as o:
            print("Processing {}".format(file))
            with open(file, 'rt', encoding='utf-8', errors='ignore') as f:
                for p in f:
                    # Do lower case if required
                    if len(p.strip()) == 0:
                        continue
                    doc_sentences = sent_tokenize(p)
                    # Output segmented sentences
                    for sent in doc_sentences:
                        o.write(' '.join(basic_tokenizer.tokenize(sent)) +
                                "\n")
                    o.write('\n')
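Both `_get_text_file` and the global `args` are defined elsewhere in the original script, and `sent_tokenize` is presumably NLTK's sentence splitter; a hypothetical stand-in for that glue code might look like this:

# Hypothetical glue code (assumption, not part of the original script).
import argparse

from nltk.tokenize import sent_tokenize  # sentence splitter used by s_split()

parser = argparse.ArgumentParser()
parser.add_argument("--text_files", type=str, required=True,
                    help="Comma-separated list of input files")
parser.add_argument("--do_lower_case", action="store_true")
args = parser.parse_args()

def _get_text_file():
    # Return the comma-separated file list exactly as s_split() expects.
    return args.text_files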
Example No. 5
def tokenize_and_count(self, file):
    """
    Split the text into individual characters and count token frequencies.
    :param file: path to a txt file
    :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
    """
    basic_tokenizer = BasicTokenizer(
        do_lower_case=False)  # do_lower_case=False is required, otherwise accent stripping turns じ into し
    token_list = []
    with open(file, mode="r", encoding="utf-8") as fin:
        text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for tok in str_list:
            token_list += list(tok)
    # Count token frequencies
    tmp_count_dict = dict()
    for token in token_list:
        if token not in tmp_count_dict:
            tmp_count_dict[token] = 0
        tmp_count_dict[token] += 1
    self.q_count_dic.put(tmp_count_dict)  # push the counts onto the shared queue
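For reference, the manual counting loop above is equivalent to collections.Counter; a minimal sketch:

# Equivalent frequency counting with collections.Counter (sketch).
from collections import Counter

tmp_count_dict = dict(Counter(token_list))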
Example No. 6
import pandas as pd
from tqdm import tqdm

from tokenization import BasicTokenizer, FullTokenizer


def gen_data(in_file, out_file, tagType):
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [_.strip() for _ in f.readlines()]

    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)

    data_all = [
        preprocess2dict(s, tagType, full_tokenizer, basic_tokenizer)
        for s in tqdm(raw_data)
    ]

    df = pd.DataFrame(data_all)
    # separate with \t
    df.to_csv(out_file, sep='\t', encoding='utf-8', index=False)

    print('Finish writing generated ' + tagType + ' data in ' + out_file)
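A typical invocation, mirroring the test block in Example No. 10 below (paths and tag scheme are illustrative):

# Hypothetical call; preprocess2dict and ../models/vocab.txt must exist.
gen_data('tmp_output.txt', 'train.tsv', 'BIO')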
Example No. 7
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                            orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
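A hedged usage sketch of the heuristic described above, assuming `collections` and `logger` are set up at module level:

# The prediction is already normalized (lower-cased, punctuation-split); the
# alignment recovers the properly cased original span without the extra "'s".
span = get_final_text("steve smith", "Steve Smith's", do_lower_case=True)
print(span)  # Steve Smith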
Example No. 8
import os
import json
import argparse

import numpy as np
from tqdm import tqdm
from tokenization import whitespace_tokenize, BasicTokenizer


title_s = "<title>"
title_e = "</title>"
tokenizer = BasicTokenizer()

def save(data, dir_name, data_type):
    if not os.path.isdir(os.path.join('data', dir_name)):
        os.makedirs(os.path.join('data', dir_name))

    file_path = os.path.join('data', dir_name, '{}.json'.format(data_type))
    with open(file_path, 'w') as f:
        print ("Saving {}".format(file_path))
        json.dump({'data': data}, f)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='hotpotqa')
    parser.add_argument('--task', type=str, default="hotpot-all")
    args = parser.parse_args()

    if args.task == 'hotpot-all':
        training_data = load_hotpot(args, 'train')
        save(training_data, 'hotpot-all', 'train')
Example No. 9
import json

import tensorflow as tf  # TF1-style flags/logging API
from tokenization import BasicTokenizer
from arabert.preprocess import ArabertPreprocessor  # assumed import path for the AraBERT preprocessor

flags = tf.flags
FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string("input_file", None,
                    "The input json file with a SQUAD like structure.")

flags.DEFINE_string("output_file", None,
                    "The ouput json file with AraBERT preprocessing applied.")

flags.DEFINE_string("model_name", None, "Check the accepted models list")

bt = BasicTokenizer()


def clean_preprocess(text, arabert_prep):
    text = " ".join(bt._run_split_on_punc(arabert_prep.preprocess(text)))
    text = " ".join(text.split())  # removes extra whitespaces
    return text
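For reference, the punctuation-splitting step alone behaves roughly like this (a sketch; the AraBERT preprocessing step is omitted):

# _run_split_on_punc returns chunks with punctuation isolated; joining them
# with spaces and re-splitting collapses any repeated whitespace.
chunks = bt._run_split_on_punc("hello,world!")
print(" ".join(" ".join(chunks).split()))  # hello , world !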


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    logger = tf.get_logger()
    logger.propagate = False

    arabert_prep = ArabertPreprocessor(model_name=FLAGS.model_name,
                                       keep_emojis=False)
Example No. 10
        infile = infile_dir + dt + '_test.tsv'
        outfile = output_dir_used + 'test.tsv'
        gen_data(infile, outfile, tagType)


TESTFLAG = False
#TESTFLAG = True

if __name__ == '__main__':
    #remove_u3000('tmp_input.txt', 'tmp_output.txt')
    #batch_remove_u3000()
    if TESTFLAG:
        vocab_file = '../models/vocab.txt'
        full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
        basic_tokenizer = BasicTokenizer(do_lower_case=True)
        sent = """
            目前 由 232 位 院士 ( Fellow 及 Founding Fellow ) ,
            66 位 協院士 ( Associate Fellow ) 24 位 通信 院士 
            ( Corresponding Fellow ) 及 2 位 通信 協院士 
            ( Corresponding Associate Fellow ) 組成 
            ( 不 包括 一九九四年 當選 者 ) ,
            """
        print(preprocess2dict(sent, 'BIO', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))
        sent = '目前 由 232 位 院士 ( Fellow 及 Founding Fellow ) ,'
        print(preprocess2dict(sent, 'BIO', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))

        infile = 'tmp_output.txt'
Example No. 11
!7z x '/content/drive/My Drive/baidu_ernie.7z' -r -o/content/baidu_ernie

from google.colab import drive
drive.mount('/content/drive')

!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

import jieba
jieba.load_userdict("/content/userdict.txt")
with open('/content/stopwords.txt') as f:
    stopWords = [line.strip() for line in f.readlines()]

import pandas as pd
from tokenization import BasicTokenizer

tokenizer = BasicTokenizer()

df = pd.read_csv('/content/bert_remove.csv')
# Tokenize the comments
df['cutted'] = df['comment'].apply(lambda x: tokenizer.tokenize(x))
#df['cutted'] = df['comment'].apply(lambda x: jieba.lcut(x))

# Prepare the train / validation / test datasets
train_x = list(df['cutted'][:int(len(df)*0.7)])
train_y = list(df['label'][:int(len(df)*0.7)])

valid_x = list(df['cutted'][int(len(df)*0.7):int(len(df)*0.85)])
valid_y = list(df['label'][int(len(df)*0.7):int(len(df)*0.85)])

test_x = list(df['cutted'][int(len(df)*0.85):])
test_y = list(df['label'][int(len(df)*0.85):])
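The 70/15/15 slicing above could equivalently be written with scikit-learn while keeping the original row order (a sketch, assuming scikit-learn is installed):

from sklearn.model_selection import train_test_split

# Carve out 70% for training, then split the remaining 30% in half (15%/15%).
train_df, rest_df = train_test_split(df, train_size=0.7, shuffle=False)
valid_df, test_df = train_test_split(rest_df, train_size=0.5, shuffle=False)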
Example No. 12
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Example No. 13
import sys
from os import path
sys.path.append( path.dirname( path.dirname( path.abspath(__file__) ) ) )

from spacy.tokens import Doc
import spacy
from tokenization import BasicTokenizer

def my_tokenizer(text):
    bert_tokens = basic_tokenizer.tokenize(text)
    return Doc(nlp.vocab, words=bert_tokens)

nlp = spacy.load('en_core_web_lg')
nlp.tokenizer = my_tokenizer
never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
basic_tokenizer = BasicTokenizer(do_lower_case=True,
                                 never_split=never_split)

text = 'The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'

spacy_doc = nlp(text)

spacy_tokens=[(t.i,
            # t.text,
            # t.head.text,
            t.head.i) for t in spacy_doc]

# for token in spacy_doc:
#     print(token.text, token.dep_, token.head.text, token.head.pos_,
#             [child for child in token.children])

# for chunk in spacy_doc.noun_chunks:
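With BasicTokenizer plugged in as nlp.tokenizer, the parse runs over BERT-style tokens, so the indices in spacy_tokens line up with the tokenizer's output; a brief inspection sketch:

# Print each BERT-style token together with its syntactic head (sketch).
for tok in spacy_doc:
    print(tok.i, tok.text, "->", tok.head.i, tok.head.text)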
Example No. 14
import json
import os
import re
import string
from collections import OrderedDict, defaultdict
from glob import glob

import cchardet
from bs4 import BeautifulSoup

import ssplit
from tokenization import BasicTokenizer

EXCEPTIONAL_ENTITY_TYPES = {"Protein_domain_or_region", "DNA_domain_or_region"}

BASIC_TOKENIZER = BasicTokenizer(do_lower_case=False)


def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():
            offsets.append((start_offset, end_offset))
    return offsets


def norm_path(*paths):
    return os.path.relpath(os.path.normpath(os.path.join(os.getcwd(), *paths)))
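A quick illustration of norm_path (the exact output depends on the current working directory and platform):

# Joins the pieces onto the current directory, normalizes, and returns a relative path.
print(norm_path("data", "hotpot-all", "train.json"))  # e.g. data/hotpot-all/train.json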