def count(self, decomp=True, sep='/', pos=0):
    '''Count the frequency of every atom in self.list. The nested list is
    flattened down to word level first. With decomp=True (the default),
    each key is simplified to the bare word, i.e. the POS tag is stripped.'''
    self.decomp = decomp        # remember how the keys were indexed
    temp = simplify(self.list)  # flatten the nested list down to word level
    cnt = 0                     # sanity-check counter
    seen = set()                # full tokens already counted; fixes the anomaly
                                # noted on 3/26: with decomp=True the old test
                                # `w not in self.freq.keys()` never matched the
                                # raw token, so cnt was inflated and words that
                                # share a decomposed key overwrote each other
    self.size = len(temp)
    for w in temp:
        if w in seen:           # count each distinct token only once
            continue
        seen.add(w)
        key = w
        if decomp:
            try:
                key = w.split(sep)[pos]  # keep the word, drop the POS tag
            except Exception as err_info:
                print('Key decomposition error: %s %s %s' % (err_info, w, key))
        n = temp.count(w)                            # occurrences via list.count()
        self.freq[key] = self.freq.get(key, 0) + n   # accumulate: several tokens
                                                     # may share one key after decomp
        cnt += n
    print("count::CHECK:\nlist_size:%d =? count()_size:%d" % (len(temp), cnt))
    return
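# `simplify` is called here and in context() below but is not defined in this
# section. A minimal sketch consistent with its two call sites -- simplify(lst)
# yields a flat word list, simplify(lst, True, 1) yields a sentence list -- is
# given below as a comment; the parameter names are guesses, not the original API:
#
#   def simplify(nested, keep_level=False, level=0):
#       sentences = [s for doc in nested for s in doc]  # docs -> sentences
#       if keep_level and level == 1:
#           return sentences                            # list of word-lists
#       return [w for s in sentences for w in s]        # flat list of words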
def context(self, decomp=True):
    '''Collect the surrounding word sequence for each target word.
    POS tags are excluded; the target word itself is included.'''
    if len(self.PMIwords) == 0:
        return None
    pmiw = set(self.PMIwords)
    for w in pmiw:                   # initialize one context list per target word
        self.content[w] = []
    sentences = simplify(self.list, True, 1)  # reduce the document set to a sentence set
    if decomp:                       # the file carries POS tags and needs cleaning
        for i in range(len(sentences)):
            l = sentences[i]
            if len(l) > 0:
                # strip the POS part, keep only the word; tokens without a
                # '/' separator (i.e. untagged) are dropped
                sentences[i] = [w.split('/')[0] for w in l if w.find('/') != -1]
    for line in sentences:
        for wid, w in enumerate(line):  # enumerate covers repeated occurrences;
            if w not in pmiw:           # the old line.index(w) only ever found
                continue                # the first one
            beg = wid - self.window
            if beg < 0:
                beg = 0
            end = wid + self.window + 1  # +1 because the slice end is exclusive
            if end > len(line):
                end = len(line)
            self.content[w].extend(line[beg:end])  # all contexts of w are kept in one flat list
    return None
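# Usage sketch (illustrative; the enclosing class is not shown in this section,
# so `Corpus` and the attribute setup below are assumptions, not the original
# constructor):
#
#   c = Corpus()
#   c.list = [[['我/r', '爱/v', '自然/n', '语言/n', '处理/v']]]  # docs -> sentences -> word/POS
#   c.freq, c.content = {}, {}
#   c.PMIwords, c.window = ['语言'], 2
#   c.count(decomp=True)    # c.freq == {'我': 1, '爱': 1, '自然': 1, '语言': 1, '处理': 1}
#   c.context(decomp=True)  # c.content['语言'] == ['爱', '自然', '语言', '处理']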