Python Dir_File.oper_file示例，util_file.Dir_File.oper_file Python示例

示例#1

0

显示文件

文件： t1_pku_2.py 项目： buptzzl/developer

 def __init__(self, Pdir, words, codes='gbk'):
     ''' Load the content of a single file, or of every file in a directory, into nested lists.

     Pdir  -- path of a file or a directory (must be a str).
     words -- non-empty collection of words used for the PMI similarity computation.
     codes -- encoding name used to probe both the words and the loaded data.

     Raises AssertionError when the argument check or the final decode probe fails.
     '''
     assert isinstance(Pdir,str) and len(words)>0, "ERROR: Paramater's type "
     self.dir = Pdir
     self.list = []                      # three-level list: file -> line -> column
     self.files = []                     # file list collected when a directory is walked
     self.freq = {}                      # frequency cells laid out like self.list, {word: f}
     self.modify = False                 # set when the list/file data has been modified
     self.size = 0                       # total number of words
     self.data_size = 0                  # sum() over the external collection
     self.PMIwords = list(set(words))    # de-duplicated words whose similarity is computed
     self.PMIfactors = {}                # denominators of the PMI words, square root not applied yet
     self.content = {}                   # word contexts (list first, then dict); PMI feature vectors
     self.simi = {}                      # per target word (the original note was left unfinished)
     self.window = 2                     # length of the context window, in words
     self.decomp = True                  # flag: keys carry no POS tag and are Chinese characters only
     self.codes = codes
     reader = Dir_File(Pdir)
     if not os.path.isdir(Pdir):
         # A single file: one entry in both the file list and the data list.
         self.files = [Pdir]
         self.list = [reader.oper_file(Pdir)]
     else:
         # A directory: key_value() returns a dict that is split into two parallel lists.
         mapping = reader.key_value(model='rb')
         self.files = mapping.keys()
         self.list = [mapping[name] for name in self.files]
     # Keep only the rows whose length is greater than 2.
     self.list = [[row for row in doc if len(row) > 2] for doc in self.list]
     # Encoding consistency probe; requires the first cell of the loaded data to be non-empty.
     assert words[0].decode(self.codes) and self.list[0][0][0].decode(self.codes), "init::ERROR: PMIwords or file's code-type different"

示例#2

0

显示文件

文件： golden_result_t1.py 项目： buptzzl/developer

class UtilWork:
    ''' Load answers laid out in the COAE-gold-task1 file structure into a list. '''
    def __init__(self, fpath):
        ''' Remember the path and create the file reader and the line filter. '''
        self.path = fpath
        self.list = []
        # File reading/writing helper bound to the same path.
        self.content = Dir_File(self.path)
        self.filter = LineOperator()
    def oper_file(self,model='rb'):
        ''' Read the file at self.path through the reader and store the result in self.list.

        The original note describes the result as a two-level list; the extra
        filtering step that used to follow is disabled.
        '''
        reader = self.content
        self.list = reader.oper_file(self.path, model=model)
    def oper_dir(self,pdir='',model='rb'):
        ''' Directory handling; not implemented yet. '''
        return None
    def para_monitor(self,news=[]):
        ''' Placeholder; no behaviour implemented yet. '''
        return None

示例#3

0

显示文件

文件： t3_pku_1.py 项目： buptzzl/developer

class SegmentFilter:
    ''' Match word-segmentation output against a single-polarity (n|p) dictionary.

    The dictionary supports a minimum-length filter and a 2-gram fallback match.
    Hits are stored together with their neighbouring tokens, in the layout
    [pre1, pre2, term, suf1, suf2]; the document id (fname) is appended to the
    hit token itself.
    '''
    def __init__(self,dict_path,save_pt,line=None,params=None):
        ''' Load the dictionary file(s) and set up the matcher.

        dict_path -- one dictionary path, or a list of paths; read in binary mode.
        save_pt   -- file the hits are saved to; its directory roots the file helper.
        line      -- optional token list to analyse (each token is a list: word, POS, ...).
        params    -- optional overrides for the default parameters below.

        Raises AssertionError when the first entry of a dictionary decodes to an
        empty value, and whatever decode() raises when the encoding does not match.
        '''
        # Default initialisation parameters.
        para = {'dict_type':'n','code':'gbk','num':3,'wpos':0,'mpos':1,'len':2,'npos':['n'],'vpos':['v']}
        if params: para = update_value(para,params,True)    # overlay the caller's overrides
        self.codes = para['code']           # encoding of the files
        self.num = para['num']              # number of context tokens around a hit
        self.wpos = para['wpos']            # index of the word inside a token
        self.mpos = para['mpos']            # index of the POS tag inside a token
        self.len_dict = para['len']         # lower bound, in bytes, for dictionary entries
        self.npos = para['npos']            # noun POS tags
        self.vpos = para['vpos']            # verb POS tags
        self.dict_type = para['dict_type']  # polarity of the dictionary
        self.spath = save_pt                # file the hits are saved to
        self.line = line                    # sentence (token list) to analyse
        if self.line: self.__normal()
        self.dict = []                      # dictionary entries
        self.hit = []                       # accumulated hits
        self.oper = Dir_File(os.path.dirname(save_pt))  # file helper rooted at save_pt's directory
        if self.line:
            # Encoding probe. Uses self.wpos (a bare `wpos` is not defined here) and is
            # skipped when normalisation left no token to probe.
            self.line[0][self.wpos].decode(self.codes)
        paths = dict_path if isinstance(dict_path,list) else [dict_path]
        for path in paths:
            # `with` closes the handle even when reading fails.
            with open(path,'rb') as fp:
                # Drop entries that are not longer than the lower bound (single characters).
                tlines = [ one.strip() for one in fp.readlines() if len(one.strip())>self.len_dict ]
            if not tlines:
                continue                    # nothing usable in this file
            # Decode outside the assert so the probe still runs under `python -O`.
            probe = tlines[0].decode(self.codes)
            assert probe,"!!!! init::ERROR try decode failed !!!! "
            self.dict.extend(tlines)
    def update(self,line,save=None):
        ''' Replace the sentence to analyse and, optionally, the save path.

        Input that is not a non-empty list is rejected with a warning and leaves
        the current state untouched.
        '''
        if not isinstance(line,list) or len(line)==0:
            print("!!!! update::WARN date's size too small !!!!")
            return
        line[0][self.wpos].decode(self.codes)    # encoding probe
        self.line = line
        self.__normal()
        if save and isinstance(save,str): self.spath = save
    def save(self,path=None,mode='ab'):
        ''' Write the accumulated hits (a two-level list) to path, or to self.spath. '''
        if len(self.hit)==0:
            print("!!!! save::WARN hit-data is empty !!!!")
            return
        # A blank token makes consecutive 'ab' writes start on a new line.
        self.hit.append(['',''])
        target = path if path else self.spath
        self.oper.oper_file(target,mode,self.hit)
        self.hit = []   # start a fresh batch
    def __normal(self):
        ''' Normalise the data: drop tokens with fewer than 2 fields.

        Returns False when there was nothing to normalise, True otherwise.
        '''
        if len(self.line)==0: return False
        self.line = [ tok for tok in self.line if len(tok)>1 ]
        return True
    def __pos_filter(self,atom):
        ''' Tell whether a token must be skipped because of its POS tag.

        Returns True only for noun/verb tokens when the dictionary polarity is "p".
        TODO: compute a confidence for each category.
        '''
        if len(self.npos)==0 or len(self.vpos)==0:
            print("!!!! __pos_filter::ERROR POS-values is un-initialed !!!!")
            return False                             # no POS validation possible
        # The length guard must cover both membership tests; without the parentheses
        # the verb test indexed past the end of a short token.
        if len(atom)>self.mpos and (atom[self.mpos] in self.npos or atom[self.mpos] in self.vpos):
            if self.dict_type=="p":
                return True
        return False
    def __segment_combine(self,wid,sz,size=6,punc='w',apos='a'):
        ''' Merge token wid with its predecessor and look the pair up in the dictionary.

        Only attempted when the token is not the first one, the sentence has more
        than one token, the predecessor is not punctuation and the word is at most
        `size` long (length, not character count). On a hit the token's word is
        replaced by the merged word and its POS by `apos`.
        '''
        if wid==0 or sz==1 or \
           punc in self.line[wid-1][self.mpos]  or\
           len(self.line[wid][self.wpos])>size: return False
        tword = self.line[wid-1][self.wpos] + self.line[wid][self.wpos]
        if tword in self.dict:
            self.line[wid][self.wpos] = tword
            self.line[wid][self.mpos] = apos
            return True
        return False
    def match(self,punc='w',fname=''):
        ''' Scan the current sentence for dictionary hits and collect them in self.hit.

        punc  -- POS marker identifying punctuation; hit windows never cross it.
        fname -- source file name, appended to every directly matched token.

        Returns False when there is no data, True otherwise.
        '''
        if not self.line:                           # also covers a sentence that was never set
            print("!!!! match::ERROR data is empty !!!! %s" % (fname,))
            return False
        # Normalise outside the assert so it still happens under `python -O`.
        normalized = self.__normal()
        assert normalized,"!!!! normal::ERROR !!!!"
        sz = len(self.line)
        hit_pos = -1                              # index of the previous hit
        for i in range(sz):
            atom = self.line[i][:]
            if self.__pos_filter(atom): continue  # filtered out by POS
            if atom[self.wpos] in self.dict:
                hit_pos = i
                self.line[i].append(fname)            # tag the hit with the source file name
                beg = 0
                if i>self.num: beg = i-self.num
                # Clip the window start to just after the nearest punctuation on the left.
                tmp = [l for l in range(beg,i) if punc in self.line[l][self.mpos] ]
                if len(tmp)!=0: beg = tmp[-1]+1       # start of the clause
                # NOTE(review): the slice end is exclusive, so num-1 tokens follow the hit — confirm intended.
                end = i+self.num
                if end>sz: end = sz
                tmp = [l for l in range(i+1,end) if punc in self.line[l][self.mpos] ]
                if len(tmp)!=0: end = tmp[0]          # end of the clause
                self.hit.extend(self.line[beg:end])
                self.hit.append([])         # blank row separates hits
            # Segmentation fallback: only when the two previous tokens were not hits,
            # try the merged word, keeping at most one neighbour on each side.
            elif i and (i-hit_pos)>2 and self.__segment_combine(i,sz,size=6,punc=punc):
                hit_pos = i
                print("combine hit ONE: %d,  %s" % (i,self.line[i]))
                beg = i-2
                if punc in self.line[beg][self.mpos]: tmp = [self.line[i]]
                else:   tmp = [ self.line[beg],self.line[i] ]
                end = i+1
                if end<sz and punc not in self.line[end][self.mpos]: tmp.append(self.line[end])
                self.hit.extend(tmp)
                self.hit.append([])
        print("++++ match::OVER file: %s, hit-result words: %d ++++"%(fname,len(self.hit)))
        return True
    def multi_match(self,data=None,punc='w',fname=''):
        ''' Multi-sentence version of match().

        data is either a two-level list (tokens) or a three-level list (sentences
        of tokens); leading empty entries are skipped and sentences are flattened.

        Returns False when there is nothing to match, otherwise the result of match().
        '''
        if not data:
            print("!!!! multi_match::WARN data is empty !!!!")
            return False
        # The first non-empty entry decides whether data is a three-level list.
        pos = next((i for i,item in enumerate(data) if len(item)!=0), -1)
        if pos == -1: return False          # every entry is empty
        if not isinstance(data[pos][0],list):
            self.line = data[pos:]
        else:                               # flatten sentence-level lists
            flat = [ ]
            for sentence in data[pos:]: flat.extend(sentence)
            self.line = flat
        # Propagate the outcome so success is distinguishable from the False paths above.
        return self.match(punc,fname)