Example #1
def separate(nlp, s, stop_file=None):
    separate._log.debug("\nThe outcomes of separation '{}' are:".format(s))
    # Because nlp.remove_pipe()/nlp.add_pipe(SS) are needed, the zh model has to be re-imported
    # here instead of being passed in from outside; otherwise a ValueError is raised.
    # Tracking down this bug took an entire morning!
    import zh_core_web_sm
    nlp = zh_core_web_sm.load()

    from spacy.pipeline import SentenceSegmenter

    def split_on_punctuation(doc):
        punctuation = ",;,;、和与"
        # punctuation = re.compile(r",.:;?!,。:;?!")
        start = 0
        whether_segmenter = False
        for word in doc:
            if whether_segmenter or word.is_space:  # and not word.is_space!
                yield doc[start:word.i]
                start = word.i
                whether_segmenter = False
            elif word.text in punctuation:
                whether_segmenter = True
        if start < len(doc):
            yield doc[start:len(doc)]

    SS = SentenceSegmenter(nlp.vocab, strategy=split_on_punctuation)
    nlp.add_pipe(SS)
    doc = nlp(s)
    for sent in doc.sents:
        separate._log.debug("\t{}".format(sent.text))
    return doc
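A minimal usage sketch for separate() above; the logger stored on separate._log is never created in the snippet, so wiring up a standard logging logger here is an assumption:

import logging
import zh_core_web_sm

logging.basicConfig(level=logging.DEBUG)
separate._log = logging.getLogger("separate")  # assumed: the function expects a logger attached to itself

nlp = zh_core_web_sm.load()  # reloaded again inside separate(); passed only to satisfy the signature
doc = separate(nlp, "今天路上很堵,我们绕行;然后去食堂吃饭")
print([sent.text for sent in doc.sents])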
Example #2
class Chinese_Weibo_Analyze(object):

    """
    Create a Weibo parser
    """

    nlp = zh_core_web_sm.load()

    def __init__(self, weibo: str):

        """
        :param weibo: A Weibo string
        """
        self.weibo = weibo

    @property
    def parse_chinese(self):
        """
        Get the nouns and verbs from a Weibo string
        :return: nouns list and verb list
        """
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)" \
                r"))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        weibo_without_url = re.sub(regex, '', self.weibo)
        doc = Chinese_Weibo_Analyze.nlp(weibo_without_url)
        nouns_list = list(set([token.lemma_ for token in doc if token.pos_ == "NOUN"]))
        verbs_list = list(set([token.lemma_ for token in doc if token.pos_ == "VERB"]))
        stopwords_list = create_stopwords_set(stopword_path=data_paths.chinese_stopword_path)
        nouns_list_without_stopwords = [word for word in nouns_list if word not in stopwords_list]
        verbs_list_without_stopwords = [word for word in verbs_list if word not in stopwords_list]
        return nouns_list_without_stopwords, verbs_list_without_stopwords
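A short usage sketch for the class above; the Weibo text and URL are made up, and it assumes data_paths.chinese_stopword_path points at a directory containing hit_stopwords.txt:

analyzer = Chinese_Weibo_Analyze("前方发生车祸,大家注意绕行 http://t.cn/abc123")  # hypothetical input; the URL is stripped by the regex
nouns, verbs = analyzer.parse_chinese
print(nouns, verbs)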
Example #3
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy; defaults to 1,000,000 characters (roughly 1 MB).
        """

        max_length = kwargs.get('max_length', 10**6)
        nlp = spacy.load(self.language,
                         max_length=max_length) if self.language != 'zh' else zh_core_web_sm.load()
        spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
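The enclosing reader class is not shown above, so the stand-in below is only a sketch: it supplies the language attribute that read() checks and assumes the method can be bound onto it.

class _ZhReader:
    language = 'zh'  # assumed attribute; forces the zh_core_web_sm branch inside read()

_ZhReader.read = read  # assumption: the read() method shown above is available to bind here
doc = _ZhReader().read("我去食堂吃饭。今天天气很好。")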
Example #4
def test_zh():
    import zh_core_web_sm
    nlp = zh_core_web_sm.load()
    doc = nlp("我去食堂吃饭")

    print(">>tokens")
    for token in doc:
        print(token)

    print(">>sents")
    print(list(doc.sents))

    print(">>tags")
    for w in doc:
        print(w.pos, w.pos_)

    print(">>ents")
    print(list(doc.ents))
Example #5
def China_No1():
    try:
        import zh_core_web_sm
        nlp = zh_core_web_sm.load()
        China_No1._log.info(
            "The 'zh_core_web_sm' module has been loaded to handle Chinese with spaCy."
        )
    except (ModuleNotFoundError, IOError) as e1:
        China_No1._log.error(
            "The 'zh_core_web_sm' module cannot be loaded!\n{}".format(e1))
        from spacy.lang.zh import Chinese
        nlp = Chinese()
    except Exception as e2:
        China_No1._log.critical(
            "The 'zh_core_web_sm' module cannot be loaded for an unexpected reason!\n{}\n"
            .format(e2))
        # Without a loaded pipeline there is nothing meaningful to return, so re-raise.
        raise

    import jieba
    # SETTING_FILE = const.SETTING_FILE
    current_path = os.path.dirname(os.getcwd()) + '/'
    CONFIG = ConfigFactory(SETTING_FILE).load_config()
    jieba_dict_path = CONFIG.get("lib", "jieba_dict_path")
    customized_jieba_dict = current_path + jieba_dict_path + CONFIG.get(
        "lib", "jieba_dict_file")
    try:
        # zh_core_web_sm already specifies the tokenization incorrectly, so the only option is to
        # force jieba to load a customized dictionary; the higher a term's frequency, the more
        # likely it is segmented as one word.
        # https://github.com/fxsjy/jieba/issues/14
        jieba.load_userdict(customized_jieba_dict)
        China_No1._log.info(
            "The customized jieba dictionary '{}' has been loaded.\n".format(
                customized_jieba_dict))
    except Exception as e3:
        China_No1._log.error(
            "The customized jieba dictionary '{}' cannot be loaded!\n{}\n".
            format(customized_jieba_dict, e3))

    return nlp
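A usage sketch for China_No1(); as in Example #1, the _log attribute is never created inside the snippet, so attaching a standard logging logger is an assumption, and SETTING_FILE / ConfigFactory must already be defined in the surrounding module:

import logging

logging.basicConfig(level=logging.INFO)
China_No1._log = logging.getLogger("China_No1")  # assumed: the function expects a logger attached to itself

nlp = China_No1()
print([token.text for token in nlp("今天路上很堵,发生了追尾事故")])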
Example #6
from nltk.stem.lancaster import LancasterStemmer
import re
import os
import string
import spacy
import zh_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS

import data_paths

punctuations_all = string.punctuation + '。,﹑?!:;“”()《》•……【】'

nlp = zh_core_web_sm.load()
traffic_word_set = {'堵', '拥堵', '车祸', '剐蹭', '事故', '绕行', '追尾', '相撞', '塞车', '路况'}


def create_stopwords_set(stopword_path: str) -> list:
    """
    Create the Chinese stopword list
    :param stopword_path: the path which contains the stopword
    :return: a Chinese stopword list
    """
    stopwords_list = []
    with open(os.path.join(stopword_path, 'hit_stopwords.txt'),
              'r',
              encoding='utf-8') as stopword_file:
        for line in stopword_file:
            line = line.replace("\r", "").replace("\n", "")
            stopwords_list.append(line)
    return stopwords_list
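A quick usage sketch of the helpers above; the sample sentence is made up:

stopwords = create_stopwords_set(stopword_path=data_paths.chinese_stopword_path)
tokens = [token.text for token in nlp("前方发生车祸,大家注意绕行")]
print([t for t in tokens if t not in stopwords and t not in set(punctuations_all)])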
Example #7
import zh_core_web_sm
from language_service.dto.word import Word

parser = zh_core_web_sm.load()


def tag_chinese(text):
    # Lemmatization is not meaningful for Chinese because there are no conjugations.
    return [
        Word(token=word.text, tag=word.pos_, lemma=word.text)
        for word in parser(text)
    ]
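A quick usage sketch for tag_chinese(); the sample sentence is arbitrary, and it assumes the Word DTO exposes its constructor arguments as attributes:

for word in tag_chinese("我去食堂吃饭"):
    print(word.token, word.tag, word.lemma)  # attribute names assumed to mirror the constructor keywords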