def caculate_wer(train_data, sentence):
    """Find, for each label in the training data, the training sentence most
    similar to *sentence* by word error rate (WER), and return the best label.

    NOTE(review): this definition is shadowed by the later ``caculate_wer``
    (the k-NN variant) defined in the same module; the function name's
    spelling ("caculate") is kept for backward compatibility with callers.

    :param train_data: DataFrame with at least columns 'LABEL' and
        'SEGMENT_EVERYWORD' (segments joined by '|').
    :param sentence: the query sentence, words joined by '|'.
    :return: (best_label, min_score_list) where min_score_list is a list of
        (label, min_wer_score, closest_reference) tuples sorted ascending
        by score.
    :raises ValueError: if train_data contains no rows/labels.
    """
    logging.debug('=' * 10 + 'WER' + '=' * 10)
    min_score_list = []
    for label, group in train_data.groupby(by=['LABEL']):
        logging.debug('-' * 20)
        logging.debug(u'正在计算类别(%s),个数:%d' % (label, len(group)))
        # float('inf') instead of the old magic cap of 1000: a group whose
        # every WER score is >= 1000 would otherwise keep min_reference=None
        # and report a wrong minimum.
        min_score = float('inf')
        min_reference = None
        for reference in group['SEGMENT_EVERYWORD']:
            wer_score = wer(reference.split('|'), sentence.split('|'))
            if wer_score < min_score:
                min_score = wer_score
                min_reference = reference
        logging.debug(u'最小错误率:%f,句子是:%s' % (min_score, min_reference))
        min_score_list.append((label, min_score, min_reference))
    if not min_score_list:
        # Previously an empty train_data crashed with an opaque IndexError
        # on min_score_list[0]; fail with a clear message instead.
        raise ValueError('train_data is empty: no labels to compare against')
    min_score_list = sorted(min_score_list, key=lambda x: x[1])
    most_similary_label = min_score_list[0][0]
    most_similary_score = min_score_list[0][1]
    most_similary_sentence = min_score_list[0][2]
    logging.debug('-' * 20)
    logging.debug(u'最近似句子:%s(%s),分数(%f)' % (most_similary_sentence,
                                                most_similary_label, most_similary_score))
    return most_similary_label, min_score_list
def caculate_wer(train_data, sentence, k=3):
    """Score *sentence* against every training sentence by word error rate
    (WER) and return the top-k most similar instances, for k-NN voting.

    NOTE(review): this definition shadows the earlier two-argument
    ``caculate_wer`` in the same module; the name's spelling is kept for
    backward compatibility with callers.

    :param train_data: DataFrame with columns 'SEGMENT', 'LABEL' and the
        segment column named by config['train_segment_column'] (segments
        joined by '|').
    :param sentence: the query sentence, words joined by '|'.
    :param k: number of nearest instances to return (default 3).
    :return: (scores, sentences, labels) — three parallel lists of length
        at most k, ordered by ascending WER score.
    """
    logging.debug('=' * 10 + 'WER' + '=' * 10)
    # WER between one training segment and the query sentence.
    get_wer_score = lambda x: wer(x.split('|'), sentence.split('|'))
    # .values instead of .as_matrix(): as_matrix() was deprecated in
    # pandas 0.23 and removed in 1.0, so the old code crashes on any
    # modern pandas; .values works on both old and new versions.
    sentence_wer_score_list = train_data[config['train_segment_column']].apply(get_wer_score).values
    # Indices of the k smallest WER scores (ascending = most similar first).
    sorted_index = np.argsort(sentence_wer_score_list)[0:k]
    most_similary_score = sentence_wer_score_list[sorted_index]
    most_similary_sentence = train_data['SEGMENT'].iloc[sorted_index].values
    most_similary_label = train_data['LABEL'].iloc[sorted_index].values
    logging.debug(u'前k个最相似的句子为:%s' % (' , '.join(most_similary_sentence)))
    logging.debug('分数分别为:%s' % most_similary_score)
    logging.debug(u'类别分别为:%s' % (' , '.join(most_similary_label)))
    return most_similary_score.tolist(), most_similary_sentence.tolist(), most_similary_label.tolist()