def load_parameters():
    """Load the data-related parameters from ``./config.yml``.

    The feature list is fixed to ``['f1']`` here rather than read from the
    config file.

    Returns:
        tuple: ``(feature_names, sep, vocs, max_len, use_char_feature,
        word_len)`` where ``vocs`` is whatever ``load_vocs`` returns for the
        feature vocabulary paths followed by the label vocabulary path.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)

    feature_names = ['f1']
    use_char_feature = config['model_params']['use_char_feature']

    # Vocabulary paths: one per feature, label vocabulary last.
    voc_params = config['data_params']['voc_params']
    path_vocs = [voc_params[name]['path'] for name in feature_names]
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Column separator of the data files.
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '

    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    return feature_names, sep, vocs, max_len, use_char_feature, word_len
def main():
    """Train a ``SequenceLabelingModel`` with the settings in ``./config.yml``.

    Reads the embedding, vocabulary and data settings from the config file,
    builds the training data with ``init_data`` and calls ``model.fit``.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the config file.
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    feature_names = model_params['feature_names']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Load the vocabularies: one per feature, label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = [voc_params[name]['path'] for name in feature_names]
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the training data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    data_dict = init_data(
        path=data_params['path_train'],
        feature_names=feature_names,
        sep=sep,
        vocs=vocs,
        max_len=model_params['sequence_length'],
        model='train')

    # Train the model.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        clip=model_params['clip'],
        path_model=model_params['path_model'])

    model.fit(data_dict=data_dict, dev_size=model_params['dev_size'])
def main():
    """Tag the test file with a restored ``SequenceLabelingModel``.

    Reads ``./config.yml``, rebuilds the model, restores its weights from
    ``model_params.path_model``, predicts labels for ``data_params.path_test``
    and writes ``<original line>\\t<label>`` rows to
    ``data_params.path_result``. Lines beyond the length of the predicted
    sequence are written with the label ``O``.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the config file.
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: optional char vocabulary first, then one per
    # feature, label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the test data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    data_dict = init_data(
        path=data_params['path_test'],
        feature_names=feature_names,
        sep=sep,
        vocs=vocs,
        max_len=max_len,
        model='test',
        use_char_feature=use_char_feature,
        word_len=word_len)

    # Build the model and restore the trained weights.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=model_params['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, model_params['path_model'])

    # Tag the test data.
    viterbi_sequences = model.predict(data_dict)

    # Invert the label vocabulary (last entry of vocs): index -> label.
    label_voc = {index: label for label, index in vocs[-1].items()}

    # Write the results; sentences in the test file are separated by a
    # blank line.
    with codecs.open(data_params['path_test'], 'r', encoding='utf-8') as file_r:
        sentences = file_r.read().strip().split('\n\n')
    # A context manager closes the result file even if writing fails.
    with codecs.open(
            data_params['path_result'], 'w', encoding='utf-8') as file_result:
        for i, sentence in enumerate(sentences):
            predicted = viterbi_sequences[i]
            for j, item in enumerate(sentence.split('\n')):
                if j < len(predicted):
                    file_result.write(
                        '%s\t%s\n' % (item, label_voc[predicted[j]]))
                else:
                    # No prediction for this position: fall back to 'O'.
                    file_result.write('%s\tO\n' % item)
            file_result.write('\n')
def main():
    """Predict intents and slots for the test sessions and write them out.

    Reads ``./config.yml``, keeps the first two configured features and adds
    the synthetic features ``domain0`` .. ``domain82`` (which use the
    ``default`` embedding settings), restores a ``SequenceLabelingModel2``
    from ``model_params.path_model``, runs ``model.predict`` on the data from
    ``load_session_data_domain`` and writes ``<line>\\t<label>`` rows to
    ``data_params.path_result``. Within each session the first line receives
    the intent label and the remaining lines receive slot labels.

    Raises:
        AssertionError: if ``data_params.sep`` is invalid, if the number of
            sessions differs from the number of predictions, or if a session
            has a different number of lines than predicted labels.
    """
    # Load the config file.
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    configured_feature_names = model_params['feature_names']

    # First two configured features plus 83 synthetic domain features.
    feature_names = configured_feature_names[:2]
    feature_names.extend('domain' + str(idx) for idx in range(0, 83))
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names[:2]:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # Domain features all share the 'default' embedding settings.
    for feature_name in feature_names[2:]:
        embed_params = model_params['embed_params']['default']
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies of the *configured* features (not the synthetic
    # domain features); optional char vocabulary first, label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(
        voc_params[name]['path'] for name in configured_feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the test data. The separator is validated but not needed by
    # load_session_data_domain.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    session_data_dict = load_session_data_domain(
        data_params['path_test'], configured_feature_names, vocs, max_len,
        model='test')

    # Build the model and restore the trained weights.
    model = SequenceLabelingModel2(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=model_params['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, model_params['path_model'])

    # Predict.
    infer_results = model.predict(session_data_dict)

    # Inverse lookups: index -> slot label, index -> intent label.
    slot_voc = {index: label for label, index in vocs[-1].items()}
    intent_voc = {index: label for label, index in INTENT_DIC.items()}

    with codecs.open(data_params['path_test'], 'r', encoding='utf-8') as file_r:
        sentences = file_r.readlines()

    # Each prediction is (session id, intent prediction, slot predictions);
    # flatten it to (session id, [intent, slot, slot, ...]).
    # Sample from the original author:
    # ['10000', [2], [[1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 5, 1, 1, 1, 1, 1, 22, 59]]]
    # NOTE(review): if entries really have that nested-list shape, the
    # dictionary lookups below would receive lists — confirm against
    # SequenceLabelingModel2.predict.
    infer_sentence_results = []
    for sid, pred_intents, pre_slots in infer_results:
        pre_label = [pred_intents]
        pre_label.extend(pre_slots)
        infer_sentence_results.append((sid, pre_label))
    print('predict sentences count', len(infer_sentence_results), len(sentences))

    # Group the input lines into sessions; a blank line ends a session.
    infer_session = []
    session_temp = []
    for line in sentences:
        if line.strip():
            session_temp.append(line.strip('\n'))
        else:
            infer_session.append(session_temp)
            session_temp = []
    if session_temp:
        infer_session.append(session_temp)
    assert len(infer_session) == len(infer_sentence_results)

    # A context manager closes the result file even if an assertion or a
    # label lookup fails part-way through.
    with codecs.open(
            data_params['path_result'], 'w', encoding='utf-8') as file_result:
        for sessions, results in zip(infer_session, infer_sentence_results):
            labels = results[1]
            assert len(sessions) == len(labels)
            for position, (sess, lab) in enumerate(zip(sessions, labels)):
                # The first line of a session carries the intent label.
                voc = intent_voc if position == 0 else slot_voc
                file_result.write(sess.strip() + '\t' + voc[lab] + '\n')
            file_result.write('\n')
def __init__(self):
    """Build the classification model from ``./config.yml`` and restore it.

    Sets ``self.max_len`` (configured sequence length), ``self.vocs`` (the
    result of ``load_vocs``), ``self.model`` (a ``ClassficationModel`` whose
    weights are restored from ``model_params.path_model``) and
    ``self.label_voc`` (index -> label, the inverse of the last vocabulary).
    """
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    self.max_len = model_params['sequence_length']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: optional char vocabulary first, then one per
    # feature, label vocabulary last.
    voc_params = config['data_params']['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    self.vocs = load_vocs(path_vocs)

    # Build the model and restore the trained weights.
    self.model = ClassficationModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=model_params['word_length'],
        path_model=model_params['path_model'])
    saver = tf.train.Saver()
    saver.restore(self.model.sess, model_params['path_model'])

    # Inverse of the label vocabulary: index -> label.
    self.label_voc = {value: key for key, value in self.vocs[-1].items()}
def main():
    """Train a ``SequenceLabelingModel`` from the ``config5`` training config.

    Reads ``./train_config/config_b2b_tag_5_only_jieba.yml``, prepares the
    embedding settings, vocabularies and training data, then calls
    ``model.fit``.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the config file.
    print("config5")
    with open('./train_config/config_b2b_tag_5_only_jieba.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: optional char vocabulary first, then one per
    # feature, label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the training data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    data_dict = init_data(
        path=data_params['path_train'],
        feature_names=feature_names,
        sep=sep,
        vocs=vocs,
        max_len=max_len,
        model='train',
        use_char_feature=use_char_feature,
        word_len=word_len)

    # Train the model.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        rnn_dropout=model_params['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        clip=model_params['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=model_params['conv_dropout'],
        word_length=word_len,
        path_model=model_params['path_model'])

    model.fit(data_dict=data_dict, dev_size=model_params['dev_size'])
feature_weight_shape_dict[feature_name] = \ config['model_params']['embed_params'][feature_name]['shape'] feature_weight_dropout_dict[feature_name] = \ config['model_params']['embed_params'][feature_name]['dropout_rate'] path_pre_train = config['model_params']['embed_params'][feature_name][ 'path'] if path_pre_train: with open(path_pre_train, 'rb') as file_r: feature_init_weight_dict[feature_name] = pickle.load(file_r) # 加载vocs path_vocs = [] for feature_name in feature_names: path_vocs.append(config['data_params']['voc_params'][feature_name]['path']) path_vocs.append(config['data_params']['voc_params']['label']['path']) vocs = load_vocs(path_vocs) print(vocs[-1]) print(len(vocs)) # 加载模型 model = SequenceLabelingModel( sequence_length=config['model_params']['sequence_length'], nb_classes=config['model_params']['nb_classes'], nb_hidden=config['model_params']['bilstm_params']['num_units'], feature_weight_shape_dict=feature_weight_shape_dict, feature_init_weight_dict=feature_init_weight_dict, feature_weight_dropout_dict=feature_weight_dropout_dict, dropout_rate=config['model_params']['dropout_rate'], nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names, batch_size=config['model_params']['batch_size'], train_max_patience=config['model_params']['max_patience'],
def main():
    """Train a ``SequenceLabelingModel2`` on session data with domain features.

    Reads ``./config.yml``, keeps the first two configured features and adds
    the synthetic features ``domain0`` .. ``domain82`` (which use the
    ``default`` embedding settings), loads the training sessions with
    ``load_session_data_domain`` and calls ``model.fit``.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the config file.
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    configured_feature_names = model_params['feature_names']

    # First two configured features plus 83 synthetic domain features.
    feature_names = configured_feature_names[:2]
    feature_names.extend('domain' + str(idx) for idx in range(0, 83))
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    # Feature legend from the original author:
    #   f1: character, f2: part of speech, f3: slot name
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names[:2]:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # Domain features all share the 'default' embedding settings.
    for feature_name in feature_names[2:]:
        embed_params = model_params['embed_params']['default']
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies of the *configured* features (not the synthetic
    # domain features); optional char vocabulary first, label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(
        voc_params[name]['path'] for name in configured_feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the training data. The separator is validated but not needed by
    # load_session_data_domain.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    nb_classes = model_params['nb_classes']
    session_data_dict = load_session_data_domain(
        data_params['path_train'], configured_feature_names, vocs, max_len)
    # Report the number of model features (the original printed the length
    # of the last feature *name* through a leaked loop variable).
    print(len(feature_names))

    model = SequenceLabelingModel2(
        sequence_length=max_len,
        nb_classes=nb_classes,
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        rnn_dropout=model_params['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        clip=model_params['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=model_params['conv_dropout'],
        word_length=word_len,
        path_model=model_params['path_model'])

    model.fit(data_dict=session_data_dict, dev_size=model_params['dev_size'])
def main(): # 加载配置文件 with open('./config.yml') as file_config: config = yaml.load(file_config) feature_names = config['model_params']['feature_names'] logger.info(feature_names) use_char_feature = config['model_params']['use_char_feature'] logger.info(use_char_feature) # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化) feature_weight_shape_dict = dict() feature_weight_dropout_dict = dict() feature_init_weight_dict = dict() for feature_name in feature_names: feature_weight_shape_dict[feature_name] = config['model_params'][ 'embed_params'][feature_name]['shape'] feature_weight_dropout_dict[feature_name] = config['model_params'][ 'embed_params'][feature_name]['dropout_rate'] # embeding mat, 比voc多了两行, 因为voc从2开始编序, 0, 1行用0填充 path_pre_train = config['model_params']['embed_params'][feature_name][ 'path'] # 词嵌矩阵位置 # logger.info("%s init mat path: %s" % (feature_name, path_pre_train)) with open(path_pre_train, 'rb') as file_r: feature_init_weight_dict[feature_name] = pickle.load(file_r) logger.info(feature_weight_dropout_dict) logger.info(feature_weight_shape_dict) logger.info(feature_init_weight_dict) # char embedding shape if use_char_feature: # 暂时不考虑 feature_weight_shape_dict['char'] = config['model_params'][ 'embed_params']['char']['shape'] conv_filter_len_list = config['model_params']['conv_filter_len_list'] conv_filter_size_list = config['model_params']['conv_filter_size_list'] else: # 利用卷集层来提取char的信息 conv_filter_len_list = None conv_filter_size_list = None # 加载vocs path_vocs = [] if use_char_feature: path_vocs.append(config['data_params']['voc_params']['char'] ['path']) # vocs用于将文本数字序列化 for feature_name in feature_names: path_vocs.append( config['data_params']['voc_params'][feature_name]['path']) path_vocs.append(config['data_params']['voc_params']['label']['path']) vocs = load_vocs(path_vocs) # 加载训练数据 sep_str = config['data_params']['sep'] assert sep_str in ['table', 'space'] # 数据的分隔方式 sep = '\t' if sep_str == 'table' else ' ' max_len = 
config['model_params']['sequence_length'] word_len = config['model_params']['word_length'] # 通过voc 将input f1 和输出 label 数字序列化 得到训练的输入和输出 # data_dict = None data_dict = init_data(path=config['data_params']['path_train'], feature_names=feature_names, sep=sep, vocs=vocs, max_len=max_len, model='train', use_char_feature=use_char_feature, word_len=word_len) logger.info(data_dict) # 每个特征序列化后的数据 # 训练模型 model = SequenceLabelingModel( sequence_length=config['model_params']['sequence_length'], # 句子被固定长度 nb_classes=config['model_params']['nb_classes'], nb_hidden=config['model_params']['bilstm_params']['num_units'], num_layers=config['model_params']['bilstm_params']['num_layers'], rnn_dropout=config['model_params']['bilstm_params']['rnn_dropout'], feature_weight_shape_dict=feature_weight_shape_dict, feature_init_weight_dict=feature_init_weight_dict, feature_weight_dropout_dict=feature_weight_dropout_dict, dropout_rate=config['model_params']['dropout_rate'], nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names, batch_size=config['model_params']['batch_size'], train_max_patience=config['model_params']['max_patience'], use_crf=config['model_params']['use_crf'], l2_rate=config['model_params']['l2_rate'], rnn_unit=config['model_params']['rnn_unit'], learning_rate=config['model_params']['learning_rate'], clip=config['model_params']['clip'], use_char_feature=use_char_feature, conv_filter_size_list=conv_filter_size_list, conv_filter_len_list=conv_filter_len_list, cnn_dropout_rate=config['model_params']['conv_dropout'], word_length=word_len, path_model=config['model_params']['path_model'], last_train_sess_path=None, # 为了加快训练的速度我们继续载入前面训练的参数 transfer=False) # 是否对前面载入的参数进行迁移学习,True的话就重置LSTM的输出层 model.fit(data_dict=data_dict, dev_size=config['model_params']['dev_size']) """
def predict(testlist):
    """Tag the sentences in ``testlist`` with a restored model.

    Reads ``./config.yml``, rebuilds a ``SequenceLabelingModel``, restores
    its weights from ``model_params.path_model`` and predicts labels for
    ``testlist``.

    Args:
        testlist: sequence of sentences; each sentence is a sequence of
            items whose first element (``item[0]``) is the token reported
            in the output. It is also passed to ``init_data`` as
            ``test_sens``.

    Returns:
        list: one list per sentence containing ``[token, label]`` pairs.
        Positions beyond the length of the predicted sequence are labelled
        ``'O'``.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the config file.
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: optional char vocabulary first, then one per
    # feature, label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    data_dict = init_data(
        path=data_params['path_test'],
        feature_names=feature_names,
        sep=sep,
        test_sens=testlist,
        vocs=vocs,
        max_len=max_len,
        model='test',
        use_char_feature=use_char_feature,
        word_len=word_len)

    # Build the model and restore the trained weights.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=model_params['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, model_params['path_model'])

    # Tag.
    result_sequences = model.predict(data_dict)

    # Invert the label vocabulary (last entry of vocs): index -> label.
    label_voc = {index: label for label, index in vocs[-1].items()}

    # Build the output: [token, label] pairs per sentence.
    outlist = []
    for i, sentence in enumerate(testlist):
        predicted = result_sequences[i]
        templist = []
        for j, item in enumerate(sentence):
            char = item[0]
            if j < len(predicted):
                templist.append([char, label_voc[predicted[j]]])
            else:
                # No prediction for this position: fall back to 'O'.
                templist.append([char, 'O'])
        outlist.append(templist)
    return outlist
def export_serving_model():
    """Export the trained model as a TensorFlow Serving SavedModel.

    Reads ``./config.yml``, rebuilds a ``SequenceLabelingModel``, restores
    its weights from ``model_params.path_model`` and writes a SavedModel to
    ``./Model/ner_model/1`` with a single ``ner_predict`` signature. The
    signature takes the ``f1`` input and the dropout placeholders and
    returns the logits, the actual sequence lengths and the transition
    parameters.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    with open('./config.yml') as file_config:
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6; safe_load also refuses to build arbitrary objects.
        config = yaml.safe_load(file_config)
    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropouts per feature; pre-trained embeddings
    # are loaded here as well when a path is configured.
    feature_weight_shape_dict = {}
    feature_weight_dropout_dict = {}
    feature_init_weight_dict = {}
    for feature_name in feature_names:
        embed_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = embed_params['shape']
        feature_weight_dropout_dict[feature_name] = embed_params['dropout_rate']
        path_pre_train = embed_params['path']
        if path_pre_train:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only point this at trusted embedding dumps.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: one per feature, label vocabulary last.
    # NOTE(review): the result is not used by the export below; the call is
    # kept so that missing vocabulary settings/files still fail here —
    # confirm whether it can be dropped.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = [voc_params[name]['path'] for name in feature_names]
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Validate the separator setting (the separator itself is not needed
    # for exporting).
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    word_len = model_params['word_length']

    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=model_params['path_model'])
    session = model.sess
    saver = tf.train.Saver()
    saver.restore(session, model_params['path_model'])

    # Export the serving model into <work_dir>/<model_version>.
    model_version = 1
    work_dir = './Model/ner_model'
    export_path_base = work_dir
    export_path = os.path.join(tf.compat.as_bytes(export_path_base),
                               tf.compat.as_bytes(str(model_version)))
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    # Describe the input and output tensors of the signature.
    build_tensor_info = tf.saved_model.utils.build_tensor_info
    tensor_info_input_x_f1 = build_tensor_info(
        model.input_feature_ph_dict['f1'])
    tensor_info_weight_dropout_ph_dict_f1 = build_tensor_info(
        model.weight_dropout_ph_dict['f1'])
    tensor_info_dropout_rate_ph = build_tensor_info(model.dropout_rate_ph)
    tensor_info_rnn_dropout_rate_ph = build_tensor_info(
        model.rnn_dropout_rate_ph)
    tensor_info_logits = build_tensor_info(model.logits)
    tensor_info_actual_length = build_tensor_info(
        model.sequence_actual_length)
    tensor_info_transition_params = build_tensor_info(
        model.transition_params)

    # Build the prediction signature.
    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                'input_x_f1': tensor_info_input_x_f1,
                'weight_dropout_ph_dict_f1':
                    tensor_info_weight_dropout_ph_dict_f1,
                'dropout_rate_ph': tensor_info_dropout_rate_ph,
                'rnn_dropout_rate_ph': tensor_info_rnn_dropout_rate_ph
            },
            outputs={
                'transition_params': tensor_info_transition_params,
                'logits': tensor_info_logits,
                'sequence_actual_length': tensor_info_actual_length,
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    builder.add_meta_graph_and_variables(
        session, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={'ner_predict': prediction_signature},
        main_op=tf.tables_initializer(),
        strip_default_attrs=True)
    builder.save()
    print('Done exporting!')