Пример #1
0
    def __call__(self, input):
        """Segment one or more texts with NLPIR and strip the tag suffixes.

        Args:
            input: A single text, or a list/tuple of texts. Each text is
                encoded to UTF-8 before it is passed to
                ``nlpir.ParagraphProcess`` (called with ``1`` as its second
                argument).

        Returns:
            A list with one string per input text: the whitespace-separated
            tokens of the segmenter output, each with its trailing ``/...``
            suffix removed, joined by single spaces. A text whose
            segmentation raises yields ``""``.
        """
        def clear_tag(strin):
            """Drop the part after the last "/" from every token of *strin*."""
            rs = []
            for tmpu in strin.split():
                ind = tmpu.rfind("/")
                # ind <= 0: no "/" at all, or the token starts with its only
                # "/" -- in both cases keep the token unchanged.
                rs.append(tmpu[:ind] if ind > 0 else tmpu)
            return " ".join(rs)

        if not isinstance(input, (list, tuple)):
            input = [input]

        rs = []
        for inputu in input:
            try:
                _tmp = nlpir.ParagraphProcess(inputu.encode("utf-8", "ignore"),
                                              1)
            except Exception:
                # Best effort: a failed segmentation becomes an empty result.
                # The fallback must be bytes so the decode below also works
                # on Python 3, where str has no .decode().
                _tmp = b""
            rs.append(clear_tag(_tmp.decode("utf-8", "ignore")))

        return rs
Пример #2
0
    def split_words(s):
        """Clean *s*, segment it with NLPIR and return the filtered words.

        The text is normalised, segmented, split on single spaces and then
        passed through the stop-word, blank and part-of-speech filters of
        ``SplitWords``. The value returned is whatever ``__del_non_pos``
        produces for the remaining tokens.
        """
        # Normalise the raw text: traditional -> simplified Chinese, then
        # remove tags, punctuation and digits.
        simplified = SplitWords.__convert(s)
        untagged = SplitWords.__del_non_tag(simplified)
        unpunctuated = SplitWords.__del_punctuation(untagged)
        cleaned = SplitWords.__del_digit(unpunctuated)

        # Segment, trim surrounding whitespace and split on single spaces.
        segmented = nlpir.ParagraphProcess(cleaned, True)
        tokens = segmented.strip().split(" ")

        # Stop words are removed through __del_stop, which works whether or
        # not the tokens carry a part-of-speech suffix (a plain membership
        # test against the stop list would only work on untagged tokens).
        chinese_stops = SplitWords.__read_chinese_stoplist()
        tokens = SplitWords.__del_stop(tokens, chinese_stops)
        english_stops = SplitWords.__read_english_stoplist()
        tokens = SplitWords.__del_stop(tokens, english_stops)

        # Drop leftover blanks, then discard words with unwanted parts of
        # speech and strip the tag from the words that remain.
        tokens = SplitWords.__del_blank(tokens)
        return SplitWords.__del_non_pos(tokens)
Пример #3
0
def pos_tag(data):
    """Run every line of *data* through ``nlpir.ParagraphProcess``.

    NLPIR is initialised on each call. Every line is encoded to UTF-8,
    processed with ``True`` as the second argument, and the result is
    decoded back from UTF-8.

    Returns:
        A list with one processed string per element of *data*, in order.
    """
    nlpir.Init(nlpir.PACKAGE_DIR.encode('UTF-8'), nlpir.UTF8_CODE, None)
    return [
        nlpir.ParagraphProcess(text.encode('utf-8'), True).decode('utf-8')
        for text in data
    ]
Пример #4
0
def segline(strin):
    """Segment *strin* with NLPIR and return the result as text.

    The input is encoded to UTF-8 (unencodable characters are ignored) and
    passed to ``nlpir.ParagraphProcess`` with ``0`` as the second argument.

    Returns:
        The segmenter output decoded from UTF-8, or ``""`` when the
        segmentation call raises.
    """
    try:
        rs = nlpir.ParagraphProcess(strin.encode("utf-8", "ignore"), 0)
    except Exception:
        # Best effort: fall back to an empty result. It must be bytes so the
        # decode below also works on Python 3, where str has no .decode().
        rs = b""
    return rs.decode("utf-8", "ignore")
Пример #5
0
#!/usr/bin/python
#-*- coding:utf-8 -*-
import sys
import os
import re
############################
#File Name: segment.py
#Author: weitao
#Mail: [email protected]
#Created Time: 2017-05-20 12:38:33
#Description:
############################

from pynlpir import nlpir
# Initialise NLPIR and select the POS map once, before any input is read.
nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE, None)
nlpir.SetPOSmap(nlpir.ICT_POS_MAP_FIRST)

# Each stdin line is expected to look like "<label>,<content>"; the script
# prints "<label>,<segmented content>" for it.
for line in sys.stdin:
    record = line.strip()
    if not record:
        # Blank lines carry nothing to segment.
        continue
    label, sep, content = record.partition(",")
    if not sep:
        # Without a comma there is no content field to segment.
        sys.stderr.write("skipping line without ',' separator: " + record + "\n")
        continue
    # Segment only the content; the label is re-attached verbatim below, so
    # it must not be part of the segmented text as well.
    items = nlpir.ParagraphProcess(content, 1)
    # A single parenthesised argument parses the same way on Python 2 and 3.
    print(label + "," + items)
Пример #6
0
 def ParagraphProcess(self, strs):
     """Pass *strs* to ``nlpir.ParagraphProcess`` and return its result.

     The underlying call is made with ``False`` as its second argument.
     """
     # Return the segmenter output; without this the wrapper always
     # yields None and the work is thrown away.
     return nlpir.ParagraphProcess(strs, False)