def save(para={},match=False): ''' 保存结果,match指示是否直接匹配字典 ''' src_fp = r'../COAE2011_Corpus_All_Text/' # r'./test' # kind = ['seg4_fin', 'seg4_ent', 'seg4_dig'] # ['temp'] # suf_save = '_pku3' # 新目录的后缀 union_format = lambda x:x # 默认不做格式转换 if len(para)!=0: kind = ['ict_fin','ict_ent','ict_dig'] # 使用ict文件源 suf_save = '_pku3_ict' union_format = format_rev pos_N = ['ns','nz','nh','ni','n','j','ws','nl','nt'] pos_A = ['a','b'] for f in kind: src_p = os.path.join(src_fp, f) odif = Dir_File(src_p) sentences = odif.key_value(model='rb') #; return sentences data = {} #; return sentences # print '==== len: %d ====' % len(sentences), sentences # TODO 优化类 PatternDrag 的代码,将其实例的说明提到for循环外,循环增加数据时,再处理增加的内容 for k in sentences.keys(): v = union_format(sentences[k]) #; return v # print '==== src_fPath: %s/%s, lines-sum: %d' % (src_p,k,len(v)) emotion = PatternDrag(v, pos_N, pos_A) emotion.monitor_params(para) # 修改参数 tmp = emotion.emotion_pattern() #; return tmp data[k] = emotion.format_emo( tmp ) #; return data[k] dst_p = os.path.join(src_fp,''.join([f,suf_save])) # 一个类别的结果存放一个文件 print "==== save():: write back data, path: %s,\n==== data-size: %d" % (dst_p,len(data)) odif.key_value(data, 'wb', dst_p) return
def __init__(self, Pdir, words, codes='gbk'):
    ''' Load the contents of a file|directory into list attributes. '''
    # NOTE(review): assert-based input validation is stripped under -O; kept as-is.
    assert isinstance(Pdir,str) and len(words)>0, "ERROR: Paramater's type "
    self.dir = Pdir
    self.list = []  # 3-level list: file -> line -> column
    self.files = []  # when walking a directory, holds the file list
    self.freq = {}  # frequency units {word: f}, stored aligned with self.list
    self.modify=False  # flag: list/file data has been modified
    #self.sentence = []  # 2-level list; level 1 holds segmented sentences
    self.size = 0  # total word count
    self.data_size = 0  # sum() of the external collection
    self.PMIwords = list(set(words))  # word set used for similarity computation
    self.PMIfactors = {}  # PMI word denominators; square root not yet applied
    self.content = {}  # word contexts (list first -> then dict); PMI feature vectors
    self.simi = {}  # similarity results for the target words
    self.window = 2  # context window length
    self.decomp = True  # flag: keys carry no POS tag, Chinese characters only
    self.codes = codes  # text encoding, e.g. 'gbk' -- Python 2 byte strings assumed
    #if len(data)!=0: return None
    tdir = Dir_File(Pdir)
    if os.path.isdir(Pdir):
        # key_value returns a dict; split keys and values into parallel lists
        temp = tdir.key_value(model='rb')
        self.files = temp.keys()
        self.list = [ temp[f] for f in self.files ]
    else:
        self.files = [Pdir]
        self.list = [tdir.oper_file(Pdir)]
    for i in range(len(self.list)):
        # filter each file's lines: keep only those with length > 2
        l = [ l for l in self.list[i] if len(l)>2 ]
        self.list[i] = l
    # check encoding consistency; requires the first unit of the list to be non-empty
    # NOTE(review): also fails (raises, not asserts) on empty input via IndexError
    # before the message is reached -- presumably callers guarantee non-empty files.
    assert words[0].decode(self.codes) and self.list[0][0][0].decode(self.codes), "init::ERROR: PMIwords or file's code-type different"
def save(ictF=False): ''' 直接匹配字典,保存结果 ''' from util_file import Dir_File from t3_pku import format_rev # 装载词典; 设置存储文件名 rneg = [r'./test/remark_neg.txt',r'./test/feel_neg.txt'] ; wneg = 'hit_neg.txt'; rpos = [r'./test/remark_pos.txt',r'./test/feel_pos.txt'] ; wpos = 'hit_pos.txt'; para={'code':'gbk','num':3,'wpos':0,'mpos':1,'len':2,'dict_type':'', 'npos':['n','nd','nh','ni','nl','ns','nt','nz'],'vpos':['v']} mpos=1; punc='w'; # multi_match()函数的参数 src_fp = r'../COAE2011_Corpus_All_Text/' # r'./test' # kind = ['seg4_fin', 'seg4_ent', 'seg4_dig'] # ['temp'] # union_format = lambda x:x # 默认不做格式转换 if ictF: kind = ['ict_dig','ict_fin','ict_ent'] # 使用ict文件源 wneg = 'hit_neg_hit.txt' wpos = 'hit_pos_hit.txt' para['npos']=['n','nr','nr1','nr2','nrj','nrf','ns','nsf','nt','nz','nl','ng'] para['vpos']=['v','vn','vf','vx','vi','vl','vg'] union_format = format_rev #pos_N = ['ns','nz','nh','ni','n','j','ws','nl','nt'] #pos_A = ['a','b'] for f in kind: src_p = os.path.join(src_fp, f) saveN = os.path.join(src_fp,'_'.join([f,wneg])) saveP = os.path.join(src_fp,'_'.join([f,wpos])) #; print src_p, saveN, saveP; return para['dict_type']='n' m_neg = SegmentFilter(rneg,saveN,None,para) para['dict_type']='p' m_pos = SegmentFilter(rpos,saveP,None,para) odif = Dir_File(src_p) sentences = odif.key_value(model='rb') #; return sentences data = {} #; return sentences for k in sentences.keys(): v = union_format(sentences[k]) #; print k #; return v m_neg.multi_match(v,punc,fname=k) m_pos.multi_match(v,punc,fname=k) ; #if k=="D08934.txt":return [m_pos,sentences[k]] m_neg.save() print "---- save: %s,data-size: %d ----"%(m_pos.spath,len(m_pos.hit)) m_pos.save()