Example #1
 def count(self, decomp=True, sep='/', pos=0):
     ''' Count the frequency of each atom in self.list, flattened to a
         one-level word list; with decomp=True (default), keys are
         simplified to the Chinese part of 'word/POS' tokens. '''
     self.decomp = decomp                            # remember how the keys were indexed
     temp = simplify(self.list)                      # flatten the list structure down to word level
     cnt = 0                                         # running total for the size check below
     self.size = len(temp)
     seen = set()                                    # raw words already counted, so each is counted once
     for w in temp:
         if w in seen:
             continue
         seen.add(w)
         key = w
         if decomp:
             try:
                 key = w.split(sep)[pos]
             except Exception as err_info:
                 print('Key decomposition error:', err_info, w, key)
         # accumulate: with decomp=True several raw words may share one key
         self.freq[key] = self.freq.get(key, 0) + temp.count(w)
         cnt += temp.count(w)                        # uses list.count()
     print("count::CHECK:\nlist_size:%d =? count()_size:%d" % (len(temp), cnt))
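For reference, the same tally can be reproduced standalone with collections.Counter. This is only an illustrative sketch, not part of the original class; the sample 'word/POS' tokens are made up.

 from collections import Counter

 # Hypothetical sample tokens in the 'word/POS' format count() expects.
 tokens = ['汉字/n', '统计/v', '汉字/n', '上下文/n']

 # decomp=True behaviour: key on the part before the separator.
 freq = Counter(t.split('/')[0] for t in tokens)
 print(freq)                               # Counter({'汉字': 2, '统计': 1, '上下文': 1})
 assert sum(freq.values()) == len(tokens)  # the size check that count() prints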
Example #2
 def context(self, decomp=True):
     ''' Collect the context word sequence of each target word;
         POS tags are stripped, the target word itself is included. '''
     if len(self.PMIwords) == 0:
         return None
     pmiw = set(self.PMIwords)
     for w in pmiw:                                  # one empty context list per target word
         self.content[w] = []
     sentences = simplify(self.list, True, 1)        # flatten the document set down to sentence level
     if decomp:                                      # the file carries POS tags that must be cleaned
         for i in range(len(sentences)):             # strip the POS part, keep only the Chinese word
             l = sentences[i]
             if len(l) > 0:
                 sentences[i] = [w.split('/')[0] for w in l if w.find('/') != -1]
     for line in sentences:
         for w in set(line).intersection(pmiw):      # target words present in this sentence
             wid = line.index(w)                     # note: only the first occurrence is used
             beg = wid - self.window
             if beg < 0:
                 beg = 0
             end = wid + self.window + 1             # +1 because the slice end is exclusive
             if end > len(line):
                 end = len(line)
             self.content[w].extend(line[beg:end])   # append the whole window to one flat list
     return None
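The window clamping in the last loop can be checked in isolation. A minimal sketch with a made-up sentence and window size, using a target at the sentence edge:

 # Hypothetical cleaned sentence (POS already stripped) and window size.
 line = ['我', '喜欢', '自然', '语言', '处理']
 window = 2
 target = '处理'                            # target word at the right edge

 wid = line.index(target)                   # first occurrence only, as in context()
 beg = max(wid - window, 0)
 end = min(wid + window + 1, len(line))     # +1: slice end is exclusive
 print(line[beg:end])                       # ['自然', '语言', '处理'], clamped at the edge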