def main(argv):
    input_dir = argv[1]
    output_dir = argv[2]
    tokenizer = BasicTokenizer()
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for folder in os.listdir(input_dir):
        input_folder = os.path.join(input_dir, folder)
        output_folder = os.path.join(output_dir, folder)
        os.mkdir(output_folder)
        for text in os.listdir(input_folder):
            fin = open(os.path.join(input_folder, text), 'r', encoding='utf8')
            fout = open(os.path.join(output_folder, text), 'w', encoding='utf8')
            for line in fin:
                if line.startswith('<doc') or line.startswith('</doc'):
                    continue
                line = line.strip()
                if not line:
                    continue
                for sentence in sentence_tokenize(line):
                    fout.write(" ".join(tokenizer.tokenize(sentence)) + '\n')
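# The function above expects a WikiExtractor-style layout (files containing
# "<doc ...>" markers) and writes one tokenized sentence per line. A minimal
# entry point for running it directly -- hypothetical, since the original
# excerpt does not show its __main__ guard -- could be:
import sys

if __name__ == "__main__":
    # Usage (illustrative): python tokenize_corpus.py <input_dir> <output_dir>
    main(sys.argv)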
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
def tokenize_by_char(file):
    """
    Split the text into individual characters.
    :param file: path to a txt file
    :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
    """
    basic_tokenizer = BasicTokenizer()
    token_list = []
    with open(file, mode="r", encoding="utf-8") as fin:
        text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for token_str in str_list:
            token_list += list(token_str)
    return token_list
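# For context: BERT's BasicTokenizer inserts spaces around CJK ideographs
# (kanji) but not around hiragana or katakana, so the list(...) expansion in
# tokenize_by_char is what completes the per-character split for Japanese
# text. A minimal sketch of that behaviour (example string is illustrative):
from tokenization import BasicTokenizer

basic_tokenizer = BasicTokenizer()
tokens = basic_tokenizer.tokenize("お姉さま")    # e.g. ['お', '姉', 'さま']
chars = [c for tok in tokens for c in tok]       # ['お', '姉', 'さ', 'ま']
print(tokens, chars)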
def s_split():
    files = _get_text_file()
    basic_tokenizer = BasicTokenizer(do_lower_case=args.do_lower_case)
    for file in files.split(","):
        with open(file + ".sent_splited", 'wt', encoding='utf-8', errors='ignore') as o:
            print("Processing {}".format(file))
            with open(file, 'rt', encoding='utf-8', errors='ignore') as f:
                for p in f:
                    # Do lower case if required
                    if len(p.strip()) == 0:
                        continue
                    doc_sentences = sent_tokenize(p)
                    # Output segmented sentences
                    for sent in doc_sentences:
                        o.write(' '.join(basic_tokenizer.tokenize(sent)) + "\n")
                    o.write('\n')
def tokenize_and_count(self, file):
    """
    Split the text into individual characters and count token frequencies.
    :param file: path to a txt file
    :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
    """
    basic_tokenizer = BasicTokenizer(
        do_lower_case=False)  # do_lower_case=False is required, otherwise じ is normalized to し
    token_list = []
    with open(file, mode="r", encoding="utf-8") as fin:
        text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for token_str in str_list:
            token_list += list(token_str)
    # Count token frequencies
    tmp_count_dict = dict()
    for token in token_list:
        if token not in tmp_count_dict:
            tmp_count_dict[token] = 0
        tmp_count_dict[token] += 1
    self.q_count_dic.put(tmp_count_dict)  # put the per-file counts onto the queue
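# The per-file dictionaries placed on self.q_count_dic still have to be merged
# by whoever drains the queue. The original excerpt does not show that step; a
# plausible sketch (hypothetical consumer, using collections.Counter):
from collections import Counter

def merge_counts(q_count_dic, num_files):
    """Drain num_files per-file count dicts from the queue and merge them."""
    total = Counter()
    for _ in range(num_files):
        total.update(q_count_dic.get())  # blocks until a worker posts its counts
    return total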
def gen_data(in_file, out_file, tagType):
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [_.strip() for _ in f.readlines()]
    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)
    data_all = [
        preprocess2dict(s, tagType, full_tokenizer, basic_tokenizer)
        for s in tqdm(raw_data)
    ]
    df = pd.DataFrame(data_all)
    # separate with \t
    df.to_csv(out_file, sep='\t', encoding='utf-8', index=False)
    print('Finish writing generated ' + tagType + ' data in ' + out_file)
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to
    # the span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
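# A worked instance of the "Steve Smith" example from the comments above
# (assumes the standard BERT BasicTokenizer behaviour; the result shown is the
# expected output, not captured from a run):
pred_text = "steve smith"      # normalized prediction from the model
orig_text = "Steve Smith's"    # raw span from the original document
print(get_final_text(pred_text, orig_text, do_lower_case=True))
# -> "Steve Smith"  (the trailing "'s" is dropped via the character alignment)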
import os
import json
import argparse
import numpy as np

from tqdm import tqdm
from tokenization import whitespace_tokenize, BasicTokenizer

title_s = "<title>"
title_e = "</title>"
tokenizer = BasicTokenizer()


def save(data, dir_name, data_type):
    if not os.path.isdir(os.path.join('data', dir_name)):
        os.makedirs(os.path.join('data', dir_name))
    file_path = os.path.join('data', dir_name, '{}.json'.format(data_type))
    with open(file_path, 'w') as f:
        print("Saving {}".format(file_path))
        json.dump({'data': data}, f)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='hotpotqa')
    parser.add_argument('--task', type=str, default="hotpot-all")
    args = parser.parse_args()

    if args.task == 'hotpot-all':
        training_data = load_hotpot(args, 'train')
        save(training_data, 'hotpot-all', 'train')
import json

flags = tf.flags
FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string("input_file", None,
                    "The input json file with a SQUAD like structure.")
flags.DEFINE_string("output_file", None,
                    "The output json file with AraBERT preprocessing applied.")
flags.DEFINE_string("model_name", None,
                    "Check the accepted models list")

bt = BasicTokenizer()


def clean_preprocess(text, arabert_prep):
    text = " ".join(bt._run_split_on_punc(arabert_prep.preprocess(text)))
    text = " ".join(text.split())  # removes extra whitespaces
    return text


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    logger = tf.get_logger()
    logger.propagate = False
    arabert_prep = ArabertPreprocessor(model_name=FLAGS.model_name, keep_emojis=False)
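# _run_split_on_punc is a private BasicTokenizer helper that returns chunks
# with punctuation isolated; joining them with spaces pads punctuation with
# whitespace before the final normalization. A small illustration, with the
# AraBERT preprocessing step left out (its output depends on the chosen model):
bt = BasicTokenizer()
chunks = bt._run_split_on_punc("hello,world!")   # ['hello', ',', 'world', '!']
text = " ".join(chunks)                          # 'hello , world !'
text = " ".join(text.split())                    # collapse any doubled spaces
print(text)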
    infile = infile_dir + dt + '_test.tsv'
    outfile = output_dir_used + 'test.tsv'
    gen_data(infile, outfile, tagType)


TESTFLAG = False
#TESTFLAG = True

if __name__ == '__main__':
    #remove_u3000('tmp_input.txt', 'tmp_output.txt')
    #batch_remove_u3000()
    if TESTFLAG:
        vocab_file = '../models/vocab.txt'
        full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
        basic_tokenizer = BasicTokenizer(do_lower_case=True)
        sent = """ 目前 由 232 位 院士 ( Fellow 及 Founding Fellow ) , 66 位 協院士 ( Associate Fellow ) 24 位 通信 院士 ( Corresponding Fellow ) 及 2 位 通信 協院士 ( Corresponding Associate Fellow ) 組成 ( 不 包括 一九九四年 當選 者 ) , """
        print(preprocess2dict(sent, 'BIO', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))

        sent = '目前 由 232 位 院士 ( Fellow 及 Founding Fellow ) ,'
        print(preprocess2dict(sent, 'BIO', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))

        infile = 'tmp_output.txt'
!7z x '/content/drive/My Drive/baidu_ernie.7z' -r -o/content/baidu_ernie

from google.colab import drive
drive.mount('/content/drive')

!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

import jieba
jieba.load_userdict("/content/userdict.txt")

with open('/content/stopwords.txt') as f:
    stopWords = [line.strip() for line in f.readlines()]

import pandas as pd
from tokenization import BasicTokenizer

tokenizer = BasicTokenizer()
df = pd.read_csv('/content/bert_remove.csv')

# Tokenize the comments
df['cutted'] = df['comment'].apply(lambda x: tokenizer.tokenize(x))
#df['cutted'] = df['comment'].apply(lambda x: jieba.lcut(x))

# Build the train / validation / test splits (70% / 15% / 15%)
train_x = list(df['cutted'][:int(len(df)*0.7)])
train_y = list(df['label'][:int(len(df)*0.7)])
valid_x = list(df['cutted'][int(len(df)*0.7):int(len(df)*0.85)])
valid_y = list(df['label'][int(len(df)*0.7):int(len(df)*0.85)])
test_x = list(df['cutted'][int(len(df)*0.85):])
test_y = list(df['label'][int(len(df)*0.85):])
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

import spacy
from spacy.tokens import Doc

from tokenization import BasicTokenizer


def my_tokenizer(text):
    bert_tokens = basic_tokenizer.tokenize(text)
    return Doc(nlp.vocab, words=bert_tokens)


nlp = spacy.load('en_core_web_lg')
nlp.tokenizer = my_tokenizer

never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
basic_tokenizer = BasicTokenizer(do_lower_case=True, never_split=never_split)

text = 'The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'

spacy_doc = nlp(text)
spacy_tokens = [(t.i,
                 # t.text,
                 # t.head.text,
                 t.head.i) for t in spacy_doc]

# for token in spacy_doc:
#     print(token.text, token.dep_, token.head.text, token.head.pos_,
#           [child for child in token.children])

# for chunk in spacy_doc.noun_chunks:
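# Because the custom tokenizer returns a Doc built from BasicTokenizer output,
# spaCy skips its own tokenization but still runs the tagger and parser over
# those tokens. A short, hypothetical sanity check:
doc = nlp("Steve Smith's book")
print([t.text for t in doc])                         # e.g. ['steve', 'smith', "'", 's', 'book']
print([(t.text, t.dep_, t.head.text) for t in doc])  # dependency heads from spaCy's parser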
import json
import os
import re
import string
from collections import OrderedDict, defaultdict
from glob import glob

import cchardet
from bs4 import BeautifulSoup

import ssplit
from tokenization import BasicTokenizer

EXCEPTIONAL_ENTITY_TYPES = {"Protein_domain_or_region", "DNA_domain_or_region"}
BASIC_TOKENIZER = BasicTokenizer(do_lower_case=False)


def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():
            offsets.append((start_offset, end_offset))
    return offsets


def norm_path(*paths):
    return os.path.relpath(os.path.normpath(os.path.join(os.getcwd(), *paths)))