def zh_tokenizer_serialize(zh_tokenizer):
    """Check that ``zh_tokenizer`` survives a bytes and a disk round trip.

    The tokenizer is serialized once with ``to_bytes()``. A fresh
    ``Chinese`` pipeline (created with ``use_jieba`` disabled in its meta)
    then loads that state, first from the bytes and then from a temporary
    directory, and each time must serialize back to the same bytes.

    :param zh_tokenizer: tokenizer under test; must provide ``to_bytes()``
        and ``to_disk()``.
    :raises AssertionError: if either round trip changes the serialized form.
    """

    def fresh_pipeline():
        # A new meta dict is built for every pipeline, as in the
        # straight-line version of this check.
        return Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})

    expected = zh_tokenizer.to_bytes()

    # Round trip 1: in-memory bytes.
    restored = fresh_pipeline()
    restored.tokenizer.from_bytes(expected)
    assert expected == restored.tokenizer.to_bytes()

    # Round trip 2: on-disk directory.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        zh_tokenizer.to_disk(target)
        restored = fresh_pipeline()
        restored.tokenizer.from_disk(target)
        assert expected == restored.tokenizer.to_bytes()
def zh_tokenizer_serialize(zh_tokenizer):
    """Check that ``zh_tokenizer`` survives a bytes and a disk round trip.

    The tokenizer is serialized once with ``to_bytes()``. A default
    ``Chinese()`` pipeline then loads that state, first from the bytes and
    then from a temporary directory, and each time must serialize back to
    the same bytes.

    :param zh_tokenizer: tokenizer under test; must provide ``to_bytes()``
        and ``to_disk()``.
    :raises AssertionError: if either round trip changes the serialized form.
    """
    reference = zh_tokenizer.to_bytes()

    # Round trip 1: in-memory bytes.
    from_bytes_nlp = Chinese()
    from_bytes_nlp.tokenizer.from_bytes(reference)
    assert reference == from_bytes_nlp.tokenizer.to_bytes()

    # Round trip 2: on-disk directory.
    with make_tempdir() as tmp_dir:
        location = tmp_dir / "tokenizer"
        zh_tokenizer.to_disk(location)
        from_disk_nlp = Chinese()
        from_disk_nlp.tokenizer.from_disk(location)
        assert reference == from_disk_nlp.tokenizer.to_bytes()
def __init__(self, language, **kwargs):
    """Initialize the parent class, then attach a spaCy pipeline as ``self.nlp``.

    :param language: forwarded unchanged to the parent initializer.
    :param kwargs: forwarded unchanged to the parent initializer.

    The pipeline is chosen from ``self.language.ISO_639_1`` (lower-cased);
    ``self.language`` is presumably set by the parent initializer - confirm.
    """
    super().__init__(language, **kwargs)
    import spacy

    iso_code = self.language.ISO_639_1.lower()
    if iso_code != 'zh':
        # Every other language is loaded by its ISO 639-1 code.
        self.nlp = spacy.load(iso_code)
        return

    # Chinese uses the bare language class instead of ``spacy.load``.
    from spacy.lang.zh import Chinese
    self.nlp = Chinese()
def __init__(self, language=None):
    """Set the language, a punctuation-stripping table, and a spaCy pipeline.

    :param language: language object exposing ``ISO_639_1``; any falsy
        value selects ``languages.ENG``.
    """
    import spacy

    self.language = language if language else languages.ENG

    # Translation table mapping every ASCII punctuation character to None,
    # i.e. ``str.translate`` with it deletes those characters.
    strip_map = dict.fromkeys(string.punctuation)
    self.punctuation_table = str.maketrans(strip_map)

    iso_code = self.language.ISO_639_1.lower()
    if iso_code != 'zh':
        self.nlp = spacy.load(iso_code)
    else:
        # Chinese uses the bare language class instead of ``spacy.load``.
        self.nlp = Chinese()
def __init__(self, language=None):
    """Set the language, a punctuation-stripping table, and a spaCy pipeline.

    :param language: language object exposing ``ISO_639_1``; any falsy
        value selects ``languages.ENG``.

    The punctuation table deletes ``string.punctuation`` plus the extra
    CJK/typographic punctuation listed in ``punc``.
    """
    import spacy

    self.language = language or languages.ENG

    # Extra punctuation to strip. The literal contains a double quote and a
    # backslash, so both are escaped here: written unescaped, the quote
    # closed the string after its third character and the rest of the line
    # was parsed as a comment, silently dropping all CJK punctuation.
    punc = (
        "!?。\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~"
        "⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿"
        "–—‘’‛“”„‟…‧﹏."
    )
    # dict.fromkeys de-duplicates characters present in both sources.
    self.punctuation_table = str.maketrans(
        dict.fromkeys(string.punctuation + punc))

    language = self.language.ISO_639_1.lower()
    if language == 'zh':
        # Chinese uses the bare language class instead of ``spacy.load``.
        self.nlp = Chinese()
    else:
        self.nlp = spacy.load(language)
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    """Serialization round trip for a pkuseg tokenizer using the "medicine" model.

    The ``zh_tokenizer_pkuseg`` argument is not used in the body; it is
    presumably a pytest fixture requested for its setup - confirm.
    """
    tokenizer_config = {
        "use_jieba": False,
        "use_pkuseg": True,
        "pkuseg_model": "medicine",
    }
    pipeline = Chinese(meta={"tokenizer": {"config": tokenizer_config}})
    zh_tokenizer_serialize(pipeline.tokenizer)
def get_clean_word_vector(word):
    """Return the vector of the first token of ``word`` as a float64 array.

    Falls back to the vector of the token ``'entity'`` when the first
    token's vector is judged empty by ``vector_is_empty`` or when reading
    it fails (for example, ``word`` produced no tokens).

    :param word: text to look up.
    :returns: ``numpy.ndarray`` with dtype ``float64``.
    """
    from spacy.lang.zh import Chinese

    parser = Chinese()
    default_vector = parser('entity')[0].vector
    parsed = parser(word)
    try:
        vector = parsed[0].vector
        if vector_is_empty(vector):
            vector = default_vector
    except Exception:
        # Best-effort fallback is kept, but unlike a bare ``except`` this no
        # longer swallows KeyboardInterrupt / SystemExit.
        vector = default_vector
    return np.array(vector, dtype=np.float64)
def get_tokenizer(lang):
    """Return a newly constructed language object for the code ``lang``.

    Supported codes: ``"zh"``, ``"en"``, ``"cs"``, ``"de"``, ``"ru"``.

    :param lang: language code string.
    :raises Exception: with message ``"Unacceptable language."`` for any
        other value.
    """
    # Guard clauses: only the class for the requested language is ever
    # looked up, so unsupported codes fail without touching any of them.
    if lang == "zh":
        return Chinese()
    if lang == "en":
        return English()
    if lang == "cs":
        return Czech()
    if lang == "de":
        return German()
    if lang == "ru":
        return Russian()
    raise Exception("Unacceptable language.")
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    """Serialization round trip for a pkuseg tokenizer initialized with "medicine".

    The ``zh_tokenizer_pkuseg`` argument is not used in the body; it is
    presumably a pytest fixture requested for its setup - confirm.
    """
    tokenizer_section = {
        "@tokenizers": "spacy.zh.ChineseTokenizer",
        "segmenter": "pkuseg",
    }
    initialize_section = {"tokenizer": {"pkuseg_model": "medicine"}}

    pipeline = Chinese.from_config(
        {
            "nlp": {"tokenizer": tokenizer_section},
            "initialize": initialize_section,
        }
    )
    # Must run before serializing so the initialize section takes effect.
    pipeline.initialize()
    zh_tokenizer_serialize(pipeline.tokenizer)
def China_No1():
    """Load a Chinese spaCy pipeline and register a custom jieba dictionary.

    Tries the ``zh_core_web_sm`` package first and falls back to a bare
    ``spacy.lang.zh.Chinese()`` when that package cannot be imported or
    loaded. Afterwards the custom jieba user dictionary named in the
    project configuration is loaded; a failure there is only logged.

    Logging goes through ``China_No1._log``, a function attribute that is
    not set in this block and must be attached elsewhere before calling.

    :returns: the loaded pipeline object.

    NOTE(review): on the generic ``except Exception`` path ``nlp`` is never
    assigned, so the final ``return nlp`` would raise ``UnboundLocalError``.
    NOTE(review): the critical log message mentions 'en_core_web_sm', which
    this function never attempts to load - looks like a copy/paste leftover.
    """
    try:
        import zh_core_web_sm
        nlp = zh_core_web_sm.load()
        China_No1._log.info(
            "The 'zh_core_web_sm' module has been loaded in order to handle Chinese based on SpaCy."
        )
    except (ModuleNotFoundError, IOError) as e1:
        # Model package missing or unreadable: fall back to the bare
        # Chinese language class.
        China_No1._log.error(
            "The 'zh_core_web_sm' module cannot be loaded!\n{}".format(e1))
        from spacy.lang.zh import Chinese
        nlp = Chinese()
    except Exception as e2:
        # Any other failure is only logged; no pipeline is created here.
        China_No1._log.critical(
            "Neither the 'en_core_web_sm' nor the 'zh_core_web_sm' module can be loaded!\n{}\n"
            .format(e2))
    import jieba
    # SETTING_FILE = const.SETTING_FILE
    # With the assignment above commented out, SETTING_FILE must already
    # exist at module scope.
    # The dictionary path is built relative to the parent of the current
    # working directory.
    current_path = os.path.dirname(os.getcwd()) + '/'
    CONFIG = ConfigFactory(SETTING_FILE).load_config()
    jieba_dict_path = CONFIG.get("lib", "jieba_dict_path")
    customized_jieba_dict = current_path + jieba_dict_path + CONFIG.get(
        "lib", "jieba_dict_file")
    try:
        # zh_core_web_sm has already (wrongly) fixed how segmentation is
        # done, so the only option is to force jieba to load the custom
        # dictionary. The higher a word's frequency, the more likely it is
        # to be kept as one word.
        # https://github.com/fxsjy/jieba/issues/14
        jieba.load_userdict(customized_jieba_dict)
        China_No1._log.info(
            "The customized jieba dictionary '{}' has been loaded.\n".format(
                customized_jieba_dict))
    except Exception as e3:
        # Best effort: a missing or invalid dictionary does not abort loading.
        China_No1._log.error(
            "The customized jieba dictionary '{}' cannot be loaded!\n{}\n".
            format(customized_jieba_dict, e3))
    return nlp
def test_zh_uninitialized_pkuseg():
    """Switching the segmenter to pkuseg without initializing it must fail.

    The pipeline is built with the "char" segmenter, then the tokenizer's
    ``segmenter`` attribute is overwritten with "pkuseg"; calling the
    pipeline is expected to raise ``ValueError``.
    """
    tokenizer_settings = {"segmenter": "char"}
    pipeline = Chinese.from_config({"nlp": {"tokenizer": tokenizer_settings}})

    # Flip the segmenter after construction, so no pkuseg model was loaded.
    pipeline.tokenizer.segmenter = "pkuseg"

    with pytest.raises(ValueError):
        pipeline("test")
# Exercise template: the ``____`` placeholders are left for the learner to
# fill in and are intentionally kept as-is.
import json
from spacy.lang.zh import Chinese

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Chinese()
doc = nlp("智利可能会从斯洛伐克进口货物")

# Import the PhraseMatcher and instantiate it
from spacy.____ import ____

matcher = ____(____)

# Create Doc objects to use as patterns and add them to the matcher
# The code below is a faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = ____(____)
print([doc[start:end] for match_id, start, end in matches])
from spacy.lang.zh import Chinese

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

nlp = Chinese()

words = ["我", "喜欢", "周", "杰伦"]
# No token is followed by whitespace, so there is one False per word.
spaces = [False] * len(words)

# Build a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span covering "周杰伦" (tokens 2-3) and label it "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Register the span as the doc's only entity
doc.ents = [span]

# Print the text and label of every entity
entity_pairs = [(entity.text, entity.label_) for entity in doc.ents]
print(entity_pairs)
def __init__(self):
    """Create a new ``Chinese()`` object and store it on ``self.nlp``.

    NOTE(review): ``Chinese`` is presumably spaCy's ``spacy.lang.zh.Chinese``
    language class - confirm against this module's imports.
    """
    self.nlp = Chinese()
import random
import logging
from collections import Counter
import pickle as pkl

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import langdetect
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese

# Module-level pipelines, built once at import time.
jp_nlp = Japanese()
# Chinese pipeline configured to use the jieba segmenter
cn_cfg = {"segmenter": "jieba"}
cn_nlp = Chinese.from_config({"nlp": {"tokenizer": cn_cfg}})


def build_idf_vocab(corpus):
    """Build the inverse document frequency (idf) dictionary

    :param corpus: a list of strings representing the articles used to
        generate the idf dict
    :returns: a dict that maps a word to its idf value
    :rtype: dict(string, float)
    """
    # NOTE(review): the visible body stops after ``words`` and never reaches
    # the return described above - the rest is not shown here.
    vectorizer = CountVectorizer(vocabulary=None)
    matrix = vectorizer.fit_transform(corpus)
    # Per vocabulary entry: number of rows with a non-zero count (presence
    # per row, not total occurrences).
    count = (matrix.toarray() > 0).sum(axis=0)
    words = vectorizer.get_feature_names()
# Exercise template: the ``____`` placeholders are left for the learner to
# fill in and are intentionally kept as-is.
import json
from spacy.matcher import Matcher
from spacy.lang.zh import Chinese

with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Chinese()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{____: ____}, {____: ____}]

# A token whose lowercase form matches "iphone", followed by a digit
pattern2 = [{____: ____}, {____: ____}]

# Add the patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
def load_dataset(batch_size, debug=True):
    """Build bucketed train/validation iterators for English-Chinese data.

    Reads the tab-separated training file (``train_file``, four columns:
    doc id, sentence id, English, Chinese) and the validation/test data via
    ``extract_data_from_sgm``, wraps each sentence pair with
    ``sentence_translation``, and builds the vocabularies from the training
    set (English: ``min_freq=2``; Chinese: ``max_size=10000``).

    :param batch_size: batch size passed to ``BucketIterator.splits``.
    :param debug: when true, print up to the first 10 examples of each split.
    :returns: ``(train_iter, val_iter, ZH, EN)`` - note the field order.
    """
    spacy_en = spacy.load('en')
    spacy_zh = Chinese()

    # Each tokenizer uses the pipeline of its own language; the two were
    # previously swapped, so English went through the Chinese tokenizer and
    # vice versa.
    def tokenize_en(line):
        return [token.text for token in spacy_en.tokenizer(line)]

    def tokenize_zh(line):
        return [token.text for token in spacy_zh.tokenizer(line)]

    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    ZH = Field(tokenize=tokenize_zh,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')

    # Context manager so the file handle is closed deterministically.
    with open(train_file, 'rt', encoding='utf-8') as train_handle:
        lines = train_handle.read().splitlines()
    train_samples = [line.split('\t') for line in lines]
    train_docID, train_senID, train_en, train_zh = zip(*train_samples)

    val_docID, val_senID, val_en = extract_data_from_sgm(val_en_file, cols=3)
    val_zh, = extract_data_from_sgm(val_zh_file, cols=1)
    test_docID, test_senID, test_en = extract_data_from_sgm(test_en_file,
                                                            cols=3)

    if debug:
        debug_info_size = 10
        # The ranges are capped by the data size so small datasets do not
        # raise IndexError while printing.
        print('\n[Debug] First %d training examples:\n' % debug_info_size)
        for i in range(min(debug_info_size, len(train_docID))):
            print(train_docID[i], train_senID[i], train_en[i], train_zh[i])
        print('\n[Debug] First %d validation examples:\n' % debug_info_size)
        for i in range(min(debug_info_size, len(val_docID), len(val_zh))):
            print(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        print('\n[Debug] First %d test examples:\n' % debug_info_size)
        for i in range(min(debug_info_size, len(test_en))):
            print(test_en[i])

    # Index-based on purpose: a length mismatch between the parallel
    # sequences fails loudly instead of being silently truncated.
    train_examples = [
        sentence_translation(train_docID[i], train_senID[i], train_en[i],
                             train_zh[i]) for i in range(len(train_docID))
    ]
    val_examples = [
        sentence_translation(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        for i in range(len(val_docID))
    ]
    print("Train size = %d" % len(train_examples))
    print("Eval size = %d" % len(val_examples))

    train_dataset = Dataset(train_examples, {'src': EN, 'trg': ZH})
    val_dataset = Dataset(val_examples, {'src': EN, 'trg': ZH})
    print('Datasets Built!')

    EN.build_vocab(train_dataset.src, min_freq=2)
    ZH.build_vocab(train_dataset.trg, max_size=10000)
    print('Vocabularies Built!')

    train_iter, val_iter = BucketIterator.splits(
        (train_dataset, val_dataset),
        batch_size=batch_size,
        repeat=False,
        sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))
    print('Training Iterators Built!')
    return train_iter, val_iter, ZH, EN
def test_zh_unsupported_segmenter():
    """An unknown segmenter name must be rejected when the config is loaded.

    Building a ``Chinese`` pipeline with ``segmenter="unk"`` is expected to
    raise ``ConfigValidationError``.
    """
    bad_tokenizer_settings = {"segmenter": "unk"}
    bad_config = {"nlp": {"tokenizer": bad_tokenizer_settings}}

    with pytest.raises(ConfigValidationError):
        Chinese.from_config(bad_config)
# Exercise template: the ``____`` placeholders are left for the learner to
# fill in and are intentionally kept as-is.
import json
from spacy.lang.zh import Chinese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "GPE" for every match
    matches = matcher(doc)
    doc.ents = [____(____, ____, ____, label=____) for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with this getter
start_char, end_char = ent_arrays[0] label_ent_array.append((start_char, end_char + 1, l)) ents.append(label_ent_array[0]) if True == diff_contain_overlapping(ents): i = i + 1 doc = nlp(text) tags = biluo_tags_from_offsets(doc, ents) doc.ents = spans_from_biluo_tags(doc, tags) line = docs_to_json([doc]) f.write(json_dumps(line) + "\n") msg.good(f"Finished {file_path} :: {i} rows") if print_label: msg.info(f"{labels}") if __name__ == "__main__": # Chinese.Defaults.use_jieba = True nlp = Chinese() nlp.add_pipe(nlp.create_pipe('sentencizer')) dev_data = read_jsonl(Path("./cluener2020/dev.json")) train_data = read_jsonl(Path("./cluener2020/train.json")) format_data_to_jsonl(dev_data, Path("./clue_spacy_dev.jsonl")) format_data_to_jsonl(train_data, Path("./clue_spacy_train.jsonl"), print_label=True)
import json
from spacy.lang.zh import Chinese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.load(f)

with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.load(f)

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
# One Doc pattern per country name, produced in a single batched pass.
matcher.add("COUNTRY", None, *nlp.pipe(COUNTRIES))


def countries_component(doc):
    """Overwrite ``doc.ents`` with one "GPE" span per matcher hit and return ``doc``."""
    gpe_spans = []
    for _match_id, start, end in matcher(doc):
        gpe_spans.append(Span(doc, start, end, label="GPE"))
    doc.ents = gpe_spans
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)
import torch import torch.nn as nn from torchtext import vocab from torchtext.data import NestedField, Field, Pipeline, TabularDataset, BucketIterator import torch.nn.functional as F import numpy as np import os import re from tqdm import tqdm import spacy from spacy.lang.zh import Chinese nlp = spacy.load("en", disable=["parser", "tagger", "ner"]) nlp_chinese = Chinese() __author__ = "Serena Khoo" class DataLoader(): """ This is the dataloader class that takes in a path and return a generator that could be iterated through init: path: path of the data to read in (assumes CSV format) config: a Config object that contains the parameters to be used shuffle: whether to shuffle the data or not (true by default) """ def __init__(self, config, split, type_="train", lang="en"):
# Spacy from spacy.lang.en import English from spacy.lang.es import Spanish from spacy.lang.fr import French from spacy.lang.zh import Chinese from spacy.lang.ru import Russian from spacy.lang.ar import Arabic from spacy.lang.de import German from spacy.lang.uk import Ukrainian from spacy.lang.ro import Romanian lang_id_to_spacy = { 'en': English(), 'es': Spanish(), 'fr': French(), 'zh-cn': Chinese(), 'ru': Russian(), 'ar': Arabic(), 'de': German(), 'uk': Ukrainian(), 'ro': Romanian() } ##################### ### Globals ##################### reddit = Reddit(client_id='OFsSWAsbFrzLpg', client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4', password='******', user_agent='testscript by /u/pocaguirre',
from spacy.lang.ca import Catalan from spacy.lang.eu import Basque from DataHandler import load_df_twitter_sent, load_df_lorelei from util import clean_str as test_clean_str from nltk.corpus import stopwords from util import identity_fn, lang2id language_dict = { 'english': English(), 'spanish': Spanish(), 'french': French(), 'italian': Italian(), 'german': German(), 'russian': Russian(), 'chinese': Chinese(), 'japanese': Japanese(), 'catalan': Catalan(), 'basque': Basque(), } class Tokenizer: def __init__(self, language, tokenizer_method='spacy', remove_stopwords=True, lowercase=True, strip_accents=None, ngram_range=(1, 1), min_freq=1,