示例#1
0
def save(para={},match=False):
    ''' 保存结果,match指示是否直接匹配字典 '''
    src_fp = r'../COAE2011_Corpus_All_Text/'     # r'./test' # 
    kind = ['seg4_fin', 'seg4_ent', 'seg4_dig']  #  ['temp'] #
    suf_save = '_pku3'                           # 新目录的后缀
    union_format = lambda x:x                    # 默认不做格式转换
    if len(para)!=0:
        kind = ['ict_fin','ict_ent','ict_dig']   # 使用ict文件源
        suf_save = '_pku3_ict'
        union_format = format_rev
    pos_N = ['ns','nz','nh','ni','n','j','ws','nl','nt']
    pos_A = ['a','b']
    for f in kind:
        src_p = os.path.join(src_fp, f)        
        odif = Dir_File(src_p)         
        sentences = odif.key_value(model='rb')      #; return sentences        
        data = {}   #; return sentences
        # print '==== len: %d ====' % len(sentences), sentences
        # TODO 优化类 PatternDrag 的代码,将其实例的说明提到for循环外,循环增加数据时,再处理增加的内容
        for k in sentences.keys():
            v = union_format(sentences[k])           #;    return v
            # print '==== src_fPath: %s/%s, lines-sum: %d' % (src_p,k,len(v))   
            emotion = PatternDrag(v, pos_N, pos_A)
            emotion.monitor_params(para)      # 修改参数
            tmp = emotion.emotion_pattern()   #; return tmp
            data[k] = emotion.format_emo( tmp )  #; return data[k]
        dst_p = os.path.join(src_fp,''.join([f,suf_save]))       # 一个类别的结果存放一个文件
        print "==== save():: write back data, path: %s,\n==== data-size: %d" % (dst_p,len(data))
        odif.key_value(data, 'wb', dst_p)
    return
示例#2
0
 def __init__(self, Pdir, words, codes='gbk'):
     ''' Load the contents of a file or a whole directory into ``self.list``.

     Pdir  -- path to a single file or to a directory of segmented text.
     words -- non-empty collection of target words for PMI similarity.
     codes -- character encoding shared by *words* and the files ('gbk').

     Raises AssertionError on malformed arguments or when the encodings
     of *words* and the file contents disagree.
     '''
     # Raise explicitly instead of ``assert`` so validation survives ``-O``;
     # AssertionError is kept so existing callers catch the same type.
     if not (isinstance(Pdir, str) and len(words) > 0):
         raise AssertionError("ERROR: Paramater's type ")
     self.dir    = Pdir
     self.list   = []                        # 3-level list: file -> line -> column
     self.files  = []                        # file names collected while walking the directory
     self.freq   = {}                        # frequency units {word: f}, aligned with self.list
     self.modify = False                     # set True once list/file data is modified
     #self.sentence = []                     # 2-level list: segmented sentences (unused)
     self.size   = 0                         # total word count
     self.data_size = 0                      # sum() of the external collection
     self.PMIwords = list(set(words))        # de-duplicated words to compute similarity for
     self.PMIfactors = {}                    # PMI denominators; square root NOT yet applied
     self.content = {}                       # word contexts (list first, then dict) -> PMI feature vectors
     self.simi   = {}                        # similarity results for the target words
     self.window = 2                         # context window length
     self.decomp = True                      # True when keys are bare words without POS tags
     self.codes = codes
     tdir = Dir_File(Pdir)
     if os.path.isdir(Pdir):
         # Directory: key_value() returns a dict; split keys/values into lists.
         temp = tdir.key_value(model='rb')
         self.files = temp.keys()
         self.list = [ temp[f] for f in self.files ]
     else:
         self.files = [Pdir]
         self.list = [tdir.oper_file(Pdir)]
     # Keep only units longer than 2 in every file's list (same filter as
     # the original per-index loop, expressed as one comprehension).
     self.list = [ [unit for unit in doc if len(unit) > 2] for doc in self.list ]
     # Encoding sanity check; requires the first unit of the first file to be
     # non-empty, exactly as the original assert did.
     if not (words[0].decode(self.codes) and self.list[0][0][0].decode(self.codes)):
         raise AssertionError("init::ERROR: PMIwords or file's code-type different")
示例#3
0
def save(ictF=False):
    ''' Match the corpus directly against sentiment dictionaries and save hits.

    ictF -- when True, read the ICT-segmented corpus sources instead of the
            default pku-segmented ones, widen the noun/verb POS tag sets to
            the ICT tagset, and write to the ``*_hit`` output names.
    '''
    from util_file import Dir_File
    from t3_pku import format_rev
    # Load the dictionaries; set the output file names.
    rneg = [r'./test/remark_neg.txt',r'./test/feel_neg.txt'] ;
    wneg = 'hit_neg.txt'; 
    rpos = [r'./test/remark_pos.txt',r'./test/feel_pos.txt'] ;
    wpos = 'hit_pos.txt';
    para={'code':'gbk','num':3,'wpos':0,'mpos':1,'len':2,'dict_type':'',
          'npos':['n','nd','nh','ni','nl','ns','nt','nz'],'vpos':['v']}
    mpos=1; punc='w';  # parameters for the multi_match() calls below
    src_fp = r'../COAE2011_Corpus_All_Text/'     # r'./test' # 
    kind = ['seg4_fin', 'seg4_ent', 'seg4_dig']  #  ['temp'] #
    union_format = lambda x:x                    # default: no format conversion
    if ictF:
        kind = ['ict_dig','ict_fin','ict_ent']   # use the ICT file sources
        wneg = 'hit_neg_hit.txt'
        wpos = 'hit_pos_hit.txt'
        para['npos']=['n','nr','nr1','nr2','nrj','nrf','ns','nsf','nt','nz','nl','ng']
        para['vpos']=['v','vn','vf','vx','vi','vl','vg']
        union_format = format_rev 
    #pos_N = ['ns','nz','nh','ni','n','j','ws','nl','nt']
    #pos_A = ['a','b']
    for f in kind:
        src_p = os.path.join(src_fp, f)
        saveN = os.path.join(src_fp,'_'.join([f,wneg]))
        saveP = os.path.join(src_fp,'_'.join([f,wpos]))
        # One negative and one positive matcher per corpus kind; 'dict_type'
        # is mutated in ``para`` before each construction.
        para['dict_type']='n'
        m_neg = SegmentFilter(rneg,saveN,None,para)
        para['dict_type']='p'
        m_pos = SegmentFilter(rpos,saveP,None,para)        
        odif = Dir_File(src_p)         
        sentences = odif.key_value(model='rb')       # dict: filename -> content
        data = {}   # NOTE(review): never written in the visible lines -- confirm
        for k in sentences.keys():
            v = union_format(sentences[k])
            m_neg.multi_match(v,punc,fname=k)
            m_pos.multi_match(v,punc,fname=k)   ;
            #if k=="D08934.txt":return [m_pos,sentences[k]]
            # NOTE(review): save() runs once per input file (inside the loop),
            # rewriting the output each iteration -- confirm this incremental
            # write-back is intended rather than a save after the loop.
            m_neg.save()
            print "---- save: %s,data-size: %d ----"%(m_pos.spath,len(m_pos.hit))
            m_pos.save()