def separate(nlp, s, stop_file=None):
    separate._log.debug("\nThe outcomes of separation '{}' are:".format(s))

    # Because nlp.remove_pipe() and nlp.add_pipe(SS) are needed, the zh model has to be
    # re-imported here rather than passed in from outside; otherwise a ValueError is
    # raised. (This bug took a whole morning to track down.)
    import zh_core_web_sm
    nlp = zh_core_web_sm.load()
    from spacy.pipeline import SentenceSegmenter

    def split_on_punctuation(doc):
        punctuation = ",;,;、和与"
        # punctuation = re.compile(r",.:;?!,。:;?!")
        start = 0
        whether_segmenter = False
        for word in doc:
            if whether_segmenter or word.is_space:  # and not word.is_space!
                yield doc[start:word.i]
                start = word.i
                whether_segmenter = False
            elif word.text in punctuation:
                whether_segmenter = True
        if start < len(doc):
            yield doc[start:len(doc)]

    SS = SentenceSegmenter(nlp.vocab, strategy=split_on_punctuation)
    nlp.add_pipe(SS)
    doc = nlp(s)
    for sent in doc.sents:
        separate._log.debug("\t{}".format(sent.text))
    return doc
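# A minimal usage sketch for separate(). It assumes spaCy 2.x (spacy.pipeline.SentenceSegmenter
# was removed in spaCy 3) and attaches the function-attribute logger that separate() expects;
# the logger name and the sample sentence are illustrative.
import logging

separate._log = logging.getLogger("separate")
doc = separate(None, "今天路上很堵,大家注意绕行;晚点再出发")  # the nlp argument is reloaded inside
for sent in doc.sents:
    print(sent.text)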
class Chinese_Weibo_Analyze(object):
    """
    Create a Weibo parser
    """
    nlp = zh_core_web_sm.load()

    def __init__(self, weibo: str):
        """
        :param weibo: A Weibo string
        """
        self.weibo = weibo

    @property
    def parse_chinese(self):
        """
        Get the nouns and verbs from a Weibo string
        :return: a noun list and a verb list
        """
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)" \
                r"))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        weibo_without_url = re.sub(regex, '', self.weibo)
        doc = Chinese_Weibo_Analyze.nlp(weibo_without_url)
        nouns_list = list(set([token.lemma_ for token in doc if token.pos_ == "NOUN"]))
        verbs_list = list(set([token.lemma_ for token in doc if token.pos_ == "VERB"]))
        stopwords_list = create_stopwords_set(stopword_path=data_paths.chinese_stopword_path)
        nouns_list_without_stopwords = [word for word in nouns_list if word not in stopwords_list]
        verbs_list_without_stopwords = [word for word in verbs_list if word not in stopwords_list]
        return nouns_list_without_stopwords, verbs_list_without_stopwords
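# A minimal usage sketch for Chinese_Weibo_Analyze. It assumes that
# data_paths.chinese_stopword_path points at a directory containing 'hit_stopwords.txt'
# (see create_stopwords_set below); the Weibo text and URL are made up for illustration.
weibo_text = "早高峰京藏高速发生车祸,大家注意绕行 http://example.com/abc"
analyzer = Chinese_Weibo_Analyze(weibo_text)
nouns, verbs = analyzer.parse_chinese  # a property, so no parentheses
print("nouns:", nouns)
print("verbs:", verbs)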
def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, default to 1,000,000 characters (1mb).
    """
    max_length = kwargs.get('max_length', 10**6)
    nlp = spacy.load(self.language, max_length=max_length) if self.language != 'zh' else zh_core_web_sm.load()
    spacy_doc = nlp(text)

    sentences = []
    for sentence_id, sentence in enumerate(spacy_doc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            "POS": [token.pos_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })

    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)

    return doc
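# A minimal usage sketch for read(). It assumes the method sits on a reader class that
# stores a spaCy language code on self.language; the class name "ChineseRawTextReader"
# and the sample text are assumptions for illustration only.
reader = ChineseRawTextReader(language='zh')
doc = reader.read("我今天去食堂吃饭。食堂的饭很好吃。")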
def test_zh():
    import zh_core_web_sm
    nlp = zh_core_web_sm.load()
    doc = nlp("我去食堂吃饭")  # "I go to the canteen to eat"
    print(">>tokens")
    for token in doc:
        print(token)
    print(">>sents")
    print(list(doc.sents))
    print(">>tags")
    for w in doc:
        print(w.pos, w.pos_)
    print(">>ents")
    print(list(doc.ents))
def China_No1():
    try:
        import zh_core_web_sm
        nlp = zh_core_web_sm.load()
        China_No1._log.info(
            "The 'zh_core_web_sm' module has been loaded in order to handle Chinese based on SpaCy."
        )
    except (ModuleNotFoundError, IOError) as e1:
        China_No1._log.error(
            "The 'zh_core_web_sm' module cannot be loaded!\n{}".format(e1))
        from spacy.lang.zh import Chinese
        nlp = Chinese()
    except Exception as e2:
        China_No1._log.critical(
            "Neither the 'zh_core_web_sm' module nor spaCy's built-in Chinese pipeline can be loaded!\n{}\n"
            .format(e2))
        nlp = None  # no usable pipeline; callers must check for this

    import jieba
    # SETTING_FILE = const.SETTING_FILE
    current_path = os.path.dirname(os.getcwd()) + '/'
    CONFIG = ConfigFactory(SETTING_FILE).load_config()
    jieba_dict_path = CONFIG.get("lib", "jieba_dict_path")
    customized_jieba_dict = current_path + jieba_dict_path + CONFIG.get(
        "lib", "jieba_dict_file")
    try:
        # Because zh_core_web_sm already (incorrectly) specifies how to tokenize, the only
        # option is to force jieba to load a customized dictionary. The higher a word's
        # frequency in the dictionary, the more likely it is to be segmented as one word.
        # https://github.com/fxsjy/jieba/issues/14
        jieba.load_userdict(customized_jieba_dict)
        China_No1._log.info(
            "The customized jieba dictionary '{}' has been loaded.\n".format(
                customized_jieba_dict))
    except Exception as e3:
        China_No1._log.error(
            "The customized jieba dictionary '{}' cannot be loaded!\n{}\n".
            format(customized_jieba_dict, e3))
    return nlp
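# A minimal usage sketch for China_No1(). It assumes SETTING_FILE and ConfigFactory are
# defined at module level (as the function expects) and attaches the function-attribute
# logger; the logger name and sample sentence are illustrative.
import logging

China_No1._log = logging.getLogger("China_No1")
nlp = China_No1()
if nlp is not None:
    print([token.text for token in nlp("我去食堂吃饭")])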
from nltk.stem.lancaster import LancasterStemmer
import re
import os
import string

import spacy
import zh_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS

import data_paths

punctuations_all = string.punctuation + '。,﹑?!:;“”()《》•……【】'
nlp = zh_core_web_sm.load()
# Chinese keywords related to traffic congestion and accidents
# (e.g. 'jam', 'congestion', 'car accident', 'scrape', 'detour', 'rear-end collision').
traffic_word_set = {'堵', '拥堵', '车祸', '剐蹭', '事故', '绕行', '追尾', '相撞', '塞车', '路况'}


def create_stopwords_set(stopword_path: str) -> list:
    """
    Create the Chinese stopword list
    :param stopword_path: the path to the directory that contains the stopword file
    :return: a Chinese stopword list
    """
    stopwords_list = []
    with open(os.path.join(stopword_path, 'hit_stopwords.txt'), 'r',
              encoding='utf-8') as stopword_file:
        for line in stopword_file:
            line = line.replace("\r", "").replace("\n", "")
            stopwords_list.append(line)
    return stopwords_list
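# A minimal usage sketch for create_stopwords_set(); the path below is illustrative and
# must be a directory containing the HIT stopword file 'hit_stopwords.txt'.
stopwords = create_stopwords_set(stopword_path="data/stopwords")
print(len(stopwords), stopwords[:10])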
import zh_core_web_sm

from language_service.dto.word import Word

parser = zh_core_web_sm.load()


def tag_chinese(text):
    # Lemmatization is not meaningful for Chinese because there are no conjugations,
    # so the surface form is reused as the lemma.
    return [
        Word(token=word.text, tag=word.pos_, lemma=word.text)
        for word in parser(text)
    ]
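# A minimal usage sketch for tag_chinese(); it assumes the Word DTO exposes the
# token/tag/lemma values passed to its constructor as attributes of the same name.
for word in tag_chinese("我去食堂吃饭"):
    print(word.token, word.tag, word.lemma)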