Exemplo n.º 1
0
 def learn(self, path):
     """
     Learn candidate new words from the text file at ``path``.

     The file is read as UTF-8, one line at a time.  Each line is passed to
     ``seg_text``; every maximal run of two or more consecutive
     single-character tokens is joined into one string and its counter in
     ``self.cache`` is incremented (created at 1 when first seen).  A run is
     ended by a multi-character token or by the end of the line.

     A progress message is printed after every 1000 lines.

     :param path: path of the file to learn from.
     """
     # Local import keeps the method self-contained; io.open decodes to
     # unicode text on both Python 2 and Python 3.
     import io

     def record(run):
         # A single leftover character is not a candidate word; the same
         # threshold is applied mid-line and at end of line.
         if len(run) < 2:
             return
         new_word = ''.join(run)
         if new_word in self.cache:
             self.cache[new_word] += 1
         else:
             self.cache[new_word] = 1

     # The context manager guarantees the file is closed, even if
     # segmentation raises part-way through.
     with io.open(path, encoding='utf-8') as source:
         for count, line in enumerate(source, 1):
             run = []
             for word in seg_text(line):
                 if len(word) == 1:
                     run.append(word)
                 else:
                     record(run)
                     run = []
             # Flush the run still pending when the line ends.
             record(run)
             if count % 1000 == 0:
                 print("count:%d" % count)
Exemplo n.º 2
0
def test_seg():
    """Check that ``scseg.seg_text`` yields the three expected leading tokens."""
    seg_words = scseg.seg_text(u'研究生命起源')
    expected_tokens = (u'研究', u'生命', u'起源')
    # Compare position by position, in order, exactly as separate asserts would.
    for position, token in enumerate(expected_tokens):
        assert seg_words[position] == token