Пример #1
0
 def __init__(self,feature_file_name,feature_dict_file):
     self.id_feature = feature_id_factory(feature_dict_file)
     feature_file = codecs.open(feature_file_name,"r","utf-8")
     self.feature_file_list = feature_file.readlines()
     feature_file.close()
     self.total_line_feature,self.total_feature_without_tags =  self.get_total_line_info()
     self.iii = 0
Пример #2
0
 def __init__(self,model_file_name,feature_file_name,template_file):
     self.model_dict = {}
     self.id_factory = feature_id_factory(feature_file_name)
     self.template_file = template_file
     lines = ()
     try:
         file = codecs.open(model_file_name,"r","utf-8")
         lines = file.readlines()
         file.close()
     except Exception,e:
         print e
         print 'read model file error ,check if it exists'
Пример #3
0
def select_feature(input_file_name,output_file_name,model_dict):
   
    #默认记录前后五步的历史
    history_size = 5
    future_size = history_size
    history_queue = queue(history_size)
    feature_queue = queue(future_size)
    #查看配置文件中的特征值
    unigram = model_dict["U"]
    input_file = codecs.open(input_file_name,"r","utf-8")
    output_file = codecs.open(output_file_name,"w",'utf-8')
    input_data = input_file.readlines()
    #保证进度用
    progress=0
    lens = len(input_data)
    #记录当前预读到了第几行
    #初始化为指向第一个汉字
    idx = 1
    #从这个对象里面进行特征的编码和解码
    id_factory = feature_id_factory()
    #每一行存储一个特征字典
    line_feature_dict = {}
    
    total_line_feature_dict = []
    total_feature_without_tags = []
    #没有状态信息的特征
    feature_without_tags = []
    #全局特征,用来存储特征函数
    total_feature_dict = {}
    enter_another_line = False
    for line in input_data:
        words = line.strip().split()
        #print print_unicode(line)
        #print print_unicode(words)
        if words[0]=='<BOS>':
            idx+=1
            #预读取几行到feature_queue
            enter_another_line = False
            history_queue.clear()
            feature_queue.clear()
            for j in xrange(future_size-1):
                if(idx>=lens): break
               
                cur_words = input_data[idx].strip().split()
                
                #注意,进入此循环,代表一定发生了预读
                idx += 1
                if(cur_words[0] == '<EOS>'):
                    enter_another_line = True   
                    break
                feature_queue.push(input_data[idx-1])
            continue
       
            
        if words[0]=="<EOS>":
            feature_queue.clear()
            history_queue.clear()
            if(len(feature_without_tags) == 0):
                continue
            total_line_feature_dict.append(line_feature_dict)
            total_feature_without_tags.append(feature_without_tags)
            line_feature_dict = {}
            feature_without_tags = []
            
            #当读到此处的时候,idx肯定指向了下一个<BOS> enter_another_line肯定为True
            #所以要idx+1,跳过读取下一个<BOS>
            idx+=1
            continue
          #预读下一行的信息
        if(enter_another_line == False and idx<lens):
            cur_words = input_data[idx].strip().split()
            idx+=1
            if cur_words[0] != '<EOS>' and cur_words[0]!='<BOS>':
                feature_queue.push(input_data[idx-1])
                
            if(cur_words[0] == '<EOS>'):
                enter_another_line = True
        progress+=1
      
        #输出进度,可以调整后面的值,否则变化会太快或者太慢
        if(progress%100000==0):
            print "extract feature complete %d%%" %round((progress*100/lens+1))
        feature_list,total_feature_without_tag =  utils.encode_feature(words,unigram,history_queue,feature_queue,id_factory)
              #!!important
        feature_queue.pop()
        for cur_feature in feature_list:
            if(cur_feature not in line_feature_dict):
                line_feature_dict[cur_feature] = 1
            else:
                line_feature_dict[cur_feature]+=1
            if(cur_feature not in total_feature_dict):
                total_feature_dict[cur_feature] = 1
            else:
                total_feature_dict[cur_feature] += 1
        feature_without_tags.append(total_feature_without_tag)
        history_queue.push(words)
#     print 'writing features to file...'
#     output_file.write(str(len(feature_dict))+"\n")
#     for key in feature_dict:
#         output_file.write(key+"\t"+str(feature_dict[key])+"\n")
    

    print 'writing features into disk ...'
    total_feature_dict_filename = 'output/total_feature_with_tags.txt'
    total_feature_dict_file = codecs.open(total_feature_dict_filename,"w","utf-8")
    no = 0
    for key in total_feature_dict:
        if total_feature_dict[key] >= threshold:
            total_feature_dict_file.write(key+"\t"+str(no)+"\n")
            no+=1
    for i,item in enumerate(total_line_feature_dict):
        output_file.write("<FEA>\n")
        for item in total_feature_without_tags[i]:
            output_file.write(str(item)+"\t")
        output_file.write("\n")
        for key in total_line_feature_dict[i]:
            if(total_feature_dict[key] >= threshold):
                output_file.write(str(key)+"\t"+str(total_line_feature_dict[i][key])+"\n")
        output_file.write("<SEP>\n")
    total_feature_dict_file.close()
    output_file.close()
    id_factory.write_file("output/feature_dict.txt")  
    print 'extract feature complete!!'