# zh_tokenizer_serialize and the zh_tokenizer_pkuseg fixture are helpers
# defined elsewhere in the test module.
from spacy.lang.zh import Chinese


def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        },
        "initialize": {
            "tokenizer": {
                "pkuseg_model": "medicine",
            }
        },
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()
    zh_tokenizer_serialize(nlp.tokenizer)
import random
import logging
from collections import Counter
import pickle as pkl

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import langdetect
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese

jp_nlp = Japanese()

# Jieba
cn_cfg = {"segmenter": "jieba"}
cn_nlp = Chinese.from_config({"nlp": {"tokenizer": cn_cfg}})


def build_idf_vocab(corpus):
    """Build the inverse document frequency (IDF) dictionary.

    :param corpus: a list of strings, the articles used to compute the IDF values
    :returns: a dict that maps each word to its IDF value
    :rtype: dict(string, float)
    """
    vectorizer = CountVectorizer(vocabulary=None)
    matrix = vectorizer.fit_transform(corpus)
    # Document frequency: the number of documents each word appears in.
    count = (matrix.toarray() > 0).sum(axis=0)
    # Note: use get_feature_names_out() on scikit-learn >= 1.2.
    words = vectorizer.get_feature_names()
    # Standard IDF weight, log(N / df), as described in the docstring.
    idf = np.log(len(corpus) / count)
    return dict(zip(words, idf))
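# A quick sanity check of build_idf_vocab on a toy corpus. This is an
# illustrative sketch, not part of the original script; the toy_corpus
# name and the example sentences are invented for the demonstration.
if __name__ == "__main__":
    toy_corpus = [
        "the cat sat on the mat",
        "the dog sat",
        "the cat ran",
    ]
    idf = build_idf_vocab(toy_corpus)
    print(idf["the"])  # appears in 3/3 documents -> log(3/3) = 0.0
    print(idf["cat"])  # appears in 2/3 documents -> log(3/2) ~= 0.405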
import pytest

from spacy.lang.zh import Chinese


def test_zh_uninitialized_pkuseg():
    # Switching to pkuseg after construction leaves the segmenter
    # uninitialized, so tokenizing should raise.
    config = {"nlp": {"tokenizer": {"segmenter": "char"}}}
    nlp = Chinese.from_config(config)
    nlp.tokenizer.segmenter = "pkuseg"
    with pytest.raises(ValueError):
        nlp("test")
import pytest
from thinc.api import ConfigValidationError

from spacy.lang.zh import Chinese


def test_zh_unsupported_segmenter():
    # An unknown segmenter name should fail config validation outright.
    config = {"nlp": {"tokenizer": {"segmenter": "unk"}}}
    with pytest.raises(ConfigValidationError):
        Chinese.from_config(config)
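# For contrast with the rejection test above: "char", "jieba", and "pkuseg"
# are the segmenter values spaCy's ChineseTokenizer accepts. An illustrative
# sketch using the dependency-free "char" segmenter:
nlp_char = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "char"}}})
doc = nlp_char("测试")
print([t.text for t in doc])  # character-level tokens: ['测', '试']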