def main(argv):
    input_dir = argv[1]
    output_dir = argv[2]
    tokenizer = BasicTokenizer()
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for folder in os.listdir(input_dir):
        input_folder = os.path.join(input_dir, folder)
        output_folder = os.path.join(output_dir, folder)
        os.mkdir(output_folder)
        for text in os.listdir(input_folder):
            fin = open(os.path.join(input_folder, text), 'r', encoding='utf8')
            fout = open(os.path.join(output_folder, text), 'w', encoding='utf8')
            for line in fin:
                if line.startswith('<doc') or line.startswith('</doc'):
                    continue
                line = line.strip()
                if not line:
                    continue
                for sentence in sentence_tokenize(line):
                    fout.write(" ".join(tokenizer.tokenize(sentence)) + '\n')
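# The function above expects a WikiExtractor-style layout (files containing
# "<doc ...>" markers) and writes one tokenized sentence per line. A minimal
# entry point for running it directly -- hypothetical, since the original
# excerpt does not show its __main__ guard -- could be:
import sys

if __name__ == "__main__":
    # Usage (illustrative): python tokenize_corpus.py <input_dir> <output_dir>
    main(sys.argv)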
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
def tokenize_by_char(file):
    """
    Split the text into individual characters.
    :param file: path to a txt file
    :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
    """
    basic_tokenizer = BasicTokenizer()
    token_list = []
    with open(file, mode="r", encoding="utf-8") as fin:
        text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for token_str in str_list:
            token_list += list(token_str)
    return token_list
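# For context: BERT's BasicTokenizer inserts spaces around CJK ideographs
# (kanji) but not around hiragana or katakana, so the list(...) expansion in
# tokenize_by_char is what completes the per-character split for Japanese
# text. A minimal sketch of that behaviour (example string is illustrative):
from tokenization import BasicTokenizer

basic_tokenizer = BasicTokenizer()
tokens = basic_tokenizer.tokenize("お姉さま")    # e.g. ['お', '姉', 'さま']
chars = [c for tok in tokens for c in tok]       # ['お', '姉', 'さ', 'ま']
print(tokens, chars)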
def s_split():
    files = _get_text_file()
    basic_tokenizer = BasicTokenizer(do_lower_case=args.do_lower_case)
    for file in files.split(","):
        with open(file + ".sent_splited", 'wt', encoding='utf-8', errors='ignore') as o:
            print("Processing {}".format(file))
            with open(file, 'rt', encoding='utf-8', errors='ignore') as f:
                for p in f:
                    # Do lower case if required
                    if len(p.strip()) == 0:
                        continue
                    doc_sentences = sent_tokenize(p)
                    # Output segmented sentences
                    for sent in doc_sentences:
                        o.write(' '.join(basic_tokenizer.tokenize(sent)) + "\n")
                    o.write('\n')
def tokenize_and_count(self, file):
    """
    Split the text into individual characters and count token frequencies.
    :param file: path to a txt file
    :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
    """
    basic_tokenizer = BasicTokenizer(
        do_lower_case=False)  # do_lower_case=False is required, otherwise じ is normalized to し
    token_list = []
    with open(file, mode="r", encoding="utf-8") as fin:
        text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for token_str in str_list:
            token_list += list(token_str)
    # Count token frequencies
    tmp_count_dict = dict()
    for token in token_list:
        if token not in tmp_count_dict:
            tmp_count_dict[token] = 0
        tmp_count_dict[token] += 1
    self.q_count_dic.put(tmp_count_dict)  # put the per-file counts onto the queue
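# The per-file dictionaries placed on self.q_count_dic still have to be merged
# by whoever drains the queue. The original excerpt does not show that step; a
# plausible sketch (hypothetical consumer, using collections.Counter):
from collections import Counter

def merge_counts(q_count_dic, num_files):
    """Drain num_files per-file count dicts from the queue and merge them."""
    total = Counter()
    for _ in range(num_files):
        total.update(q_count_dic.get())  # blocks until a worker posts its counts
    return total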
def gen_data(in_file, out_file, tagType):
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [_.strip() for _ in f.readlines()]
    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)
    data_all = [
        preprocess2dict(s, tagType, full_tokenizer, basic_tokenizer)
        for s in tqdm(raw_data)
    ]
    df = pd.DataFrame(data_all)
    # separate with \t
    df.to_csv(out_file, sep='\t', encoding='utf-8', index=False)
    print('Finish writing generated ' + tagType + ' data in ' + out_file)
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to
    # the span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
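# A worked instance of the "Steve Smith" example from the comments above
# (assumes the standard BERT BasicTokenizer behaviour; the result shown is the
# expected output, not captured from a run):
pred_text = "steve smith"      # normalized prediction from the model
orig_text = "Steve Smith's"    # raw span from the original document
print(get_final_text(pred_text, orig_text, do_lower_case=True))
# -> "Steve Smith"  (the trailing "'s" is dropped via the character alignment)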
import os
import json
import argparse
import numpy as np

from tqdm import tqdm
from tokenization import whitespace_tokenize, BasicTokenizer

title_s = "<title>"
title_e = "</title>"
tokenizer = BasicTokenizer()


def save(data, dir_name, data_type):
    if not os.path.isdir(os.path.join('data', dir_name)):
        os.makedirs(os.path.join('data', dir_name))
    file_path = os.path.join('data', dir_name, '{}.json'.format(data_type))
    with open(file_path, 'w') as f:
        print("Saving {}".format(file_path))
        json.dump({'data': data}, f)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='hotpotqa')
    parser.add_argument('--task', type=str, default="hotpot-all")
    args = parser.parse_args()

    if args.task == 'hotpot-all':
        training_data = load_hotpot(args, 'train')
        save(training_data, 'hotpot-all', 'train')
import json

flags = tf.flags
FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string("input_file", None,
                    "The input json file with a SQUAD like structure.")
flags.DEFINE_string("output_file", None,
                    "The output json file with AraBERT preprocessing applied.")
flags.DEFINE_string("model_name", None,
                    "Check the accepted models list")

bt = BasicTokenizer()


def clean_preprocess(text, arabert_prep):
    text = " ".join(bt._run_split_on_punc(arabert_prep.preprocess(text)))
    text = " ".join(text.split())  # removes extra whitespaces
    return text


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    logger = tf.get_logger()
    logger.propagate = False
    arabert_prep = ArabertPreprocessor(model_name=FLAGS.model_name, keep_emojis=False)
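# _run_split_on_punc is a private BasicTokenizer helper that returns chunks
# with punctuation isolated; joining them with spaces pads punctuation with
# whitespace before the final normalization. A small illustration, with the
# AraBERT preprocessing step left out (its output depends on the chosen model):
bt = BasicTokenizer()
chunks = bt._run_split_on_punc("hello,world!")   # ['hello', ',', 'world', '!']
text = " ".join(chunks)                          # 'hello , world !'
text = " ".join(text.split())                    # collapse any doubled spaces
print(text)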
    infile = infile_dir + dt + '_test.tsv'
    outfile = output_dir_used + 'test.tsv'
    gen_data(infile, outfile, tagType)


TESTFLAG = False
#TESTFLAG = True

if __name__ == '__main__':
    #remove_u3000('tmp_input.txt', 'tmp_output.txt')
    #batch_remove_u3000()
    if TESTFLAG:
        vocab_file = '../models/vocab.txt'
        full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
        basic_tokenizer = BasicTokenizer(do_lower_case=True)
        sent = """ 目前 由 232 位 院士 ( Fellow 及 Founding Fellow ) , 66 位 協院士 ( Associate Fellow ) 24 位 通信 院士 ( Corresponding Fellow ) 及 2 位 通信 協院士 ( Corresponding Associate Fellow ) 組成 ( 不 包括 一九九四年 當選 者 ) , """
        print(preprocess2dict(sent, 'BIO', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))

        sent = '目前 由 232 位 院士 ( Fellow 及 Founding Fellow ) ,'
        print(preprocess2dict(sent, 'BIO', full_tokenizer, basic_tokenizer))
        print(preprocess2dict(sent, 'BMES', full_tokenizer, basic_tokenizer))

        infile = 'tmp_output.txt'
!7z x '/content/drive/My Drive/baidu_ernie.7z' -r -o/content/baidu_ernie

from google.colab import drive
drive.mount('/content/drive')

!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

import jieba
jieba.load_userdict("/content/userdict.txt")

with open('/content/stopwords.txt') as f:
    stopWords = [line.strip() for line in f.readlines()]

import pandas as pd
from tokenization import BasicTokenizer

tokenizer = BasicTokenizer()
df = pd.read_csv('/content/bert_remove.csv')

# Tokenize the comments
df['cutted'] = df['comment'].apply(lambda x: tokenizer.tokenize(x))
#df['cutted'] = df['comment'].apply(lambda x: jieba.lcut(x))

# Build the train / validation / test splits (70% / 15% / 15%)
train_x = list(df['cutted'][:int(len(df)*0.7)])
train_y = list(df['label'][:int(len(df)*0.7)])
valid_x = list(df['cutted'][int(len(df)*0.7):int(len(df)*0.85)])
valid_y = list(df['label'][int(len(df)*0.7):int(len(df)*0.85)])
test_x = list(df['cutted'][int(len(df)*0.85):])
test_y = list(df['label'][int(len(df)*0.85):])
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

import spacy
from spacy.tokens import Doc

from tokenization import BasicTokenizer


def my_tokenizer(text):
    bert_tokens = basic_tokenizer.tokenize(text)
    return Doc(nlp.vocab, words=bert_tokens)


nlp = spacy.load('en_core_web_lg')
nlp.tokenizer = my_tokenizer

never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
basic_tokenizer = BasicTokenizer(do_lower_case=True, never_split=never_split)

text = 'The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'

spacy_doc = nlp(text)
spacy_tokens = [(t.i,
                 # t.text,
                 # t.head.text,
                 t.head.i) for t in spacy_doc]

# for token in spacy_doc:
#     print(token.text, token.dep_, token.head.text, token.head.pos_,
#           [child for child in token.children])

# for chunk in spacy_doc.noun_chunks:
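# Because the custom tokenizer returns a Doc built from BasicTokenizer output,
# spaCy skips its own tokenization but still runs the tagger and parser over
# those tokens. A short, hypothetical sanity check:
doc = nlp("Steve Smith's book")
print([t.text for t in doc])                         # e.g. ['steve', 'smith', "'", 's', 'book']
print([(t.text, t.dep_, t.head.text) for t in doc])  # dependency heads from spaCy's parser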
import json
import os
import re
import string
from collections import OrderedDict, defaultdict
from glob import glob

import cchardet
from bs4 import BeautifulSoup

import ssplit
from tokenization import BasicTokenizer

EXCEPTIONAL_ENTITY_TYPES = {"Protein_domain_or_region", "DNA_domain_or_region"}
BASIC_TOKENIZER = BasicTokenizer(do_lower_case=False)


def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():
            offsets.append((start_offset, end_offset))
    return offsets


def norm_path(*paths):
    return os.path.relpath(os.path.normpath(os.path.join(os.getcwd(), *paths)))