Example #1
def parse_data(text):
    '''Segment text and return a dict mapping each word to its POS tag.'''
    # segmenter and tagger are assumed to be module-level objects,
    # e.g. tagger = pos_tagger.load_model(lang='zh')
    words = segmenter.seg(text)
    context = {}
    # POS Tagging: tagger.predict returns (word, tag) tuples
    tagging = tagger.predict(words)
    for (w, t) in tagging:
        context[w] = t
    return context
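A minimal usage sketch for parse_data above; the load_model call mirrors the later examples, and the inline result comment follows the #Results block shown in the full script example below (everything in this sketch is illustrative, not part of the original snippet):

from deepnlp import segmenter
from deepnlp import pos_tagger

tagger = pos_tagger.load_model(lang='zh')

tags = parse_data("我爱吃北京烤鸭")
for word, tag in tags.items():
    print(word + "/" + tag)   # e.g. 我/r, 爱/v, 吃/v, 北京/ns, 烤鸭/n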
Example #2
    def analyze(self, string):
        '''Return a list of three strings: segmented text, POS tagging, and NER tagging.'''
        res = []
        #segment
        words = segmenter.seg(string)
        segment_str = " ".join(words)
        res.append(segment_str)

        #POS
        pos_tagging = self.tag_pos(words)
        res.append(_concat_tuples(pos_tagging))

        #NER
        ner_tagging = self.tag_ner(words)
        res.append(_concat_tuples(ner_tagging))
        return res
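A hypothetical call site for the analyze method above; the instance name nlp and the class it belongs to are assumptions for illustration only:

# nlp is assumed to be an instance of the analyzer class that defines analyze()
segment_str, pos_str, ner_str = nlp.analyze("我爱吃北京烤鸭")
print(segment_str)   # space-joined segmented words
print(pos_str)       # "word/tag" pairs joined by spaces
print(ner_str)       # "word/entity" pairs joined by spaces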
Example #3
#coding:utf-8
from __future__ import unicode_literals  # compatible with python3 unicode

from deepnlp import segmenter
from deepnlp import pos_tagger
tagger = pos_tagger.load_model(lang='zh')

#Segmentation
text = "我爱吃北京烤鸭"  # unicode coding, py2 and py3 compatible
words = segmenter.seg(text)
print(" ".join(words).encode('utf-8'))

#POS Tagging
tagging = tagger.predict(words)
for (w, t) in tagging:
    pair = w + "/" + t
    print(pair.encode('utf-8'))

#Results
#我/r
#爱/v
#吃/v
#北京/ns
#烤鸭/n
Example #4
#coding=utf-8
from __future__ import unicode_literals

from deepnlp import segmenter

text = "我刚刚在浙江卫视看了电视剧老九门,觉得陈伟霆很帅"
segList = segmenter.seg(text)
text_seg = " ".join(segList)

print(text.encode('utf-8'))
print(text_seg.encode('utf-8'))
Example #5
import codecs
import os

# BASE_DIR, segmenter, tagger_pos and tagger_ner are assumed to be defined
# elsewhere (e.g. tagger_pos = pos_tagger.load_model(lang='zh')).
def _concat_tuples(tagging):
    '''Join (word, tag) tuples into a "word/tag word/tag ..." string.'''
    TOKEN_BLANK = " "
    wl = []  # word list
    for (x, y) in tagging:
        wl.append(x + "/" + y)
    concat_str = TOKEN_BLANK.join(wl)
    return concat_str

# read input file
docs = []
input_file = codecs.open(os.path.join(BASE_DIR, 'docs_test.txt'), 'r', encoding='utf-8')
for line in input_file:
    line = line.replace("\n", "").replace("\r", "")
    docs.append(line)

# Test each individual module
# output file
fileOut = codecs.open(os.path.join(BASE_DIR, 'modules_test_results.txt'), 'w', encoding='utf-8')
words = segmenter.seg(docs[0])
pos_tagging = _concat_tuples(tagger_pos.predict(words))
ner_tagging = _concat_tuples(tagger_ner.predict(words))

fileOut.write(" ".join(words) + "\n")
fileOut.write(pos_tagging + "\n")
fileOut.write(ner_tagging + "\n")
fileOut.close()

print (" ".join(words).encode('utf-8'))
print (pos_tagging.encode('utf-8'))
print (ner_tagging.encode('utf-8'))
Example #6
    def segment(self, string):
        '''Return a list of segmented words.'''
        words = segmenter.seg(string)
        return words
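A quick illustrative call, assuming an analyzer instance (hypothetically nlp) and the sentence used in the other examples; the expected segmentation matches the #Results block above:

print(nlp.segment("我爱吃北京烤鸭"))   # ['我', '爱', '吃', '北京', '烤鸭']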