def zh_tokenizer_serialize(zh_tokenizer):
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        zh_tokenizer.to_disk(file_path)
        nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def zh_tokenizer_serialize(zh_tokenizer):
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        zh_tokenizer.to_disk(file_path)
        nlp = Chinese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
Example #3
 def __init__(self, language, **kwargs):
     super().__init__(language, **kwargs)
     import spacy
     language = self.language.ISO_639_1.lower()
     if language == 'zh':
         from spacy.lang.zh import Chinese
         self.nlp = Chinese()
     else:
         self.nlp = spacy.load(language)
Example #4
    def __init__(self, language=None):
        import spacy

        self.language = language or languages.ENG

        self.punctuation_table = str.maketrans(
            dict.fromkeys(string.punctuation))
        language = self.language.ISO_639_1.lower()
        if language == 'zh':
            from spacy.lang.zh import Chinese
            self.nlp = Chinese()
        else:
            self.nlp = spacy.load(language)
Example #5
 def __init__(self, language=None):
     import spacy
     self.language = language or languages.ENG
     punc = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
     # punc = punc.decode("utf-8")
     self.punctuation_table = str.maketrans(
         dict.fromkeys(string.punctuation + punc))
     language = self.language.ISO_639_1.lower()
     if language == 'zh':
         from spacy.lang.zh import Chinese
         self.nlp = Chinese()
     else:
         self.nlp = spacy.load(language)
Example #6
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    nlp = Chinese(
        meta={
            "tokenizer": {
                "config": {
                    "use_jieba": False,
                    "use_pkuseg": True,
                    "pkuseg_model": "medicine",
                }
            }
        })
    zh_tokenizer_serialize(nlp.tokenizer)
Example #7
def get_clean_word_vector(word):

    import numpy as np
    from spacy.lang.zh import Chinese
    parser = Chinese()
    default_vector = parser('entity')[0].vector

    parsed = parser(word)
    try:
        vector = parsed[0].vector
        if vector_is_empty(vector):
            vector = default_vector
    except Exception:
        # Fall back to the default vector if the word can't be parsed
        vector = default_vector
    return np.array(vector, dtype=np.float64)
Example #8
 def get_tokenizer(lang):
     if lang == "zh":
         # nlp = spacy.load("zh_core_web_sm")
         nlp = Chinese()
     elif lang == "en":
         # nlp = spacy.load("en_core_web_sm")
         nlp = English()
     elif lang == "cs":
         nlp = Czech()
     elif lang == "de":
         # nlp = spacy.load("de_core_web_sm")
         nlp = German()
     elif lang == "ru":
         nlp = Russian()
     else:
         raise Exception("Unacceptable language.")
     return nlp
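
A small illustrative check of the helper above (the sentences are just examples and not from the source; with a blank Chinese() pipeline the resulting tokens depend on spaCy's default segmenter):

zh_nlp = get_tokenizer("zh")
print([token.text for token in zh_nlp("我喜欢周杰伦")])

en_nlp = get_tokenizer("en")
print([token.text for token in en_nlp("I like Jay Chou")])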
Example #9
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        },
        "initialize": {
            "tokenizer": {
                "pkuseg_model": "medicine",
            }
        },
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()
    zh_tokenizer_serialize(nlp.tokenizer)
Example #10
def China_No1():
    try:
        import zh_core_web_sm
        nlp = zh_core_web_sm.load()
        China_No1._log.info(
            "The 'zh_core_web_sm' module has been loaded in order to handle Chinese based on SpaCy."
        )
    except (ModuleNotFoundError, IOError) as e1:
        China_No1._log.error(
            "The 'zh_core_web_sm' module cannot be loaded!\n{}".format(e1))
        from spacy.lang.zh import Chinese
        nlp = Chinese()
    except Exception as e2:
        China_No1._log.critical(
            "Neither the 'en_core_web_sm' nor the 'zh_core_web_sm' module can be loaded!\n{}\n"
            .format(e2))

    import jieba
    # SETTING_FILE = const.SETTING_FILE
    current_path = os.path.dirname(os.getcwd()) + '/'
    CONFIG = ConfigFactory(SETTING_FILE).load_config()
    jieba_dict_path = CONFIG.get("lib", "jieba_dict_path")
    customized_jieba_dict = current_path + jieba_dict_path + CONFIG.get(
        "lib", "jieba_dict_file")
    try:
        # Because zh_core_web_sm specifies word segmentation incorrectly, jieba is forced to load a custom dictionary. The higher the frequency, the more likely a sequence is treated as one word.
        # https://github.com/fxsjy/jieba/issues/14
        jieba.load_userdict(customized_jieba_dict)
        China_No1._log.info(
            "The customized jieba dictionary '{}' has been loaded.\n".format(
                customized_jieba_dict))
    except Exception as e3:
        China_No1._log.error(
            "The customized jieba dictionary '{}' cannot be loaded!\n{}\n".
            format(customized_jieba_dict, e3))

    return nlp
Example #11
def test_zh_uninitialized_pkuseg():
    config = {"nlp": {"tokenizer": {"segmenter": "char"}}}
    nlp = Chinese.from_config(config)
    nlp.tokenizer.segmenter = "pkuseg"
    with pytest.raises(ValueError):
        nlp("test")
Example #12
import json
from spacy.lang.zh import Chinese

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Chinese()
doc = nlp("智利可能会从斯洛伐克进口货物")

# Import the PhraseMatcher and instantiate it
from spacy.____ import ____

matcher = ____(____)

# Create pattern Doc objects and add them to the matcher
# The line below is faster than: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = ____(____)
print([doc[start:end] for match_id, start, end in matches])
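
One possible completion of the blanks above, reusing the spaCy v2-style PhraseMatcher API that the surrounding examples use (a sketch of the intended answer, not taken verbatim from the source):

import json
from spacy.lang.zh import Chinese
from spacy.matcher import PhraseMatcher

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Chinese()
doc = nlp("智利可能会从斯洛伐克进口货物")

# Instantiate the PhraseMatcher with the shared vocabulary
matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the matched spans
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])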
Example #13
from spacy.lang.zh import Chinese

nlp = Chinese()

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["我", "喜欢", "周", "杰伦"]
spaces = [False, False, False, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# 为doc中的"周杰伦"创建一个span,并赋予其"PERSON"的标签
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print the text and label of all entities
print([(ent.text, ent.label_) for ent in doc.ents])
Example #14
 def __init__(self):
     self.nlp = Chinese()
Example #15
import random
import logging
from collections import Counter
import pickle as pkl

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import langdetect

from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese

jp_nlp = Japanese()
# Jieba
cn_cfg = {"segmenter": "jieba"}
cn_nlp = Chinese.from_config({"nlp": {"tokenizer": cn_cfg}})


def build_idf_vocab(corpus):
    """Build the inverse document frequency(idf) dictionary

    :param corpus: a list of string represent the articles to generate idf dict

    :returns: a dict that maps a word to its idf value
    :rtype: dict(string, float)
    """

    vectorizer = CountVectorizer(vocabulary=None)
    matrix = vectorizer.fit_transform(corpus)
    count = (matrix.toarray() > 0).sum(axis=0)
    words = vectorizer.get_feature_names()
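    # The snippet is truncated here. A minimal sketch of how the idf dictionary
    # could be completed, assuming the plain definition idf(w) = log(N / df(w));
    # the name "idf_dict" is illustrative and not from the original source.
    n_docs = matrix.shape[0]
    idf_dict = {word: float(np.log(n_docs / df)) for word, df in zip(words, count)}
    return idf_dict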
Example #16
import json
from spacy.matcher import Matcher
from spacy.lang.zh import Chinese

with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Chinese()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{____: ____}, {____: ____}]

# A token whose lowercase form matches "iphone", followed by a digit
pattern2 = [{____: ____}, {____: ____}]

# Add the patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
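
A possible way to fill in the two patterns, using the LOWER and IS_DIGIT token attributes (a sketch; it reuses the nlp, matcher, and TEXTS objects defined in the exercise above):

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# A token whose lowercase form matches "iphone", followed by a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])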
Example #17
def load_dataset(batch_size, debug=True):
    spacy_en = spacy.load('en')
    spacy_zh = Chinese()

    def tokenize_en(line):
        return [token.text for token in spacy_en.tokenizer(line)]

    def tokenize_zh(line):
        return [token.text for token in spacy_zh.tokenizer(line)]

    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    ZH = Field(tokenize=tokenize_zh,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')

    lines = open(train_file, 'rt', encoding='utf-8').read().splitlines()
    train_samples = [line.split('\t') for line in lines]
    train_docID, train_senID, train_en, train_zh = zip(*train_samples)

    val_docID, val_senID, val_en = extract_data_from_sgm(val_en_file, cols=3)
    val_zh, = extract_data_from_sgm(val_zh_file, cols=1)

    test_docID, test_senID, test_en = extract_data_from_sgm(test_en_file,
                                                            cols=3)

    if debug:
        debug_info_size = 10
        print('\n[Debug] First %d training examples:\n' % debug_info_size)
        for i in range(debug_info_size):
            print(train_docID[i], train_senID[i], train_en[i], train_zh[i])
        print('\n[Debug] First %d validation examples:\n' % debug_info_size)
        for i in range(debug_info_size):
            print(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        print('\n[Debug] First %d test examples:\n' % debug_info_size)
        for i in range(debug_info_size):
            print(test_en[i])

    train_examples = [
        sentence_translation(train_docID[i], train_senID[i], train_en[i],
                             train_zh[i]) for i in range(len(train_docID))
    ]
    val_examples = [
        sentence_translation(val_docID[i], val_senID[i], val_en[i], val_zh[i])
        for i in range(len(val_docID))
    ]

    print("Train size = %d" % len(train_examples))
    print("Eval size = %d" % len(val_examples))

    train_dataset = Dataset(train_examples, {'src': EN, 'trg': ZH})
    val_dataset = Dataset(val_examples, {'src': EN, 'trg': ZH})
    print('Datasets Built!')

    EN.build_vocab(train_dataset.src, min_freq=2)
    ZH.build_vocab(train_dataset.trg, max_size=10000)
    print('Vocabularies Built!')

    train_iter, val_iter = BucketIterator.splits(
        (train_dataset, val_dataset),
        batch_size=batch_size,
        repeat=False,
        sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))
    print('Training Iterators Built!')
    return train_iter, val_iter, ZH, EN
Example #18
def test_zh_unsupported_segmenter():
    config = {"nlp": {"tokenizer": {"segmenter": "unk"}}}
    with pytest.raises(ConfigValidationError):
        Chinese.from_config(config)
Example #19
import json
from spacy.lang.zh import Chinese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "GPE" for each match
    matches = matcher(doc)
    doc.ents = [____(____, ____, ____, label=____) for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# Getter that looks up the span's text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with this getter
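
Example #19 is cut off at this point (the completed matcher component appears again in Example #21 below). One possible way to fill in the remaining blanks, assuming the spaCy v2-style API used throughout these examples; this is a sketch, not the original solution:

# Inside countries_component: one Span per match, labelled "GPE"
doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]

# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Register the Span extension attribute "capital" with the getter above
Span.set_extension("capital", getter=get_capital)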
Example #20
                    start_char, end_char = ent_arrays[0]
                    label_ent_array.append((start_char, end_char + 1, l))
                ents.append(label_ent_array[0])

            if True == diff_contain_overlapping(ents):
                i = i + 1

                doc = nlp(text)
                tags = biluo_tags_from_offsets(doc, ents)
                doc.ents = spans_from_biluo_tags(doc, tags)

                line = docs_to_json([doc])
                f.write(json_dumps(line) + "\n")

    msg.good(f"Finished {file_path} :: {i} rows")
    if print_label:
        msg.info(f"{labels}")


if __name__ == "__main__":
    # Chinese.Defaults.use_jieba = True
    nlp = Chinese()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    dev_data = read_jsonl(Path("./cluener2020/dev.json"))
    train_data = read_jsonl(Path("./cluener2020/train.json"))

    format_data_to_jsonl(dev_data, Path("./clue_spacy_dev.jsonl"))
    format_data_to_jsonl(train_data,
                         Path("./clue_spacy_train.jsonl"),
                         print_label=True)
Example #21
import json
from spacy.lang.zh import Chinese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "GPE" for each match
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span's text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)
Example #22
import torch
import torch.nn as nn
from torchtext import vocab
from torchtext.data import NestedField, Field, Pipeline, TabularDataset, BucketIterator
import torch.nn.functional as F

import numpy as np
import os
import re
from tqdm import tqdm

import spacy
from spacy.lang.zh import Chinese

nlp = spacy.load("en", disable=["parser", "tagger", "ner"])
nlp_chinese = Chinese()

__author__ = "Serena Khoo"


class DataLoader():
    """
	This is the dataloader class that takes in a path and returns a generator that can be iterated through

	init:
		path: path of the data to read in (assumes CSV format)
		config: a Config object that contains the parameters to be used
		shuffle: whether to shuffle the data or not (true by default)

	"""
    def __init__(self, config, split, type_="train", lang="en"):
Example #23
# Spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
Example #24
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
                 remove_stopwords=True,
                 lowercase=True,
                 strip_accents=None,
                 ngram_range=(1, 1),
                 min_freq=1,