from keras.callbacks import ReduceLROnPlateau, EarlyStopping

def run():
    batch_size = 63
    epochs = 5000
    data_process = DataProcess(use_word2cut=False)
    model = build_model()
    documents_length = data_process.get_documents_size(data_process.enc_ids_file,
                                                       data_process.dec_ids_file)
    if batch_size > documents_length:
        print("ERROR--->" + "Too little corpus data, please add more")
        return None
    # Adaptive learning rate: shrink the LR by `factor` when val_loss plateaus.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20,
                                  min_lr=1e-6, mode='min')
    # monitor:  quantity to watch, e.g. val_loss or val_acc
    # patience: number of epochs without improvement before training is stopped
    # verbose:  verbosity mode
    # mode:     one of 'auto', 'min', 'max'; in 'min' mode training stops when the
    #           monitored value stops decreasing, in 'max' mode when it stops rising
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=2)
    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=int(documents_length / batch_size) + 5,
                        validation_data=generate_batch(batch_size=batch_size),
                        validation_steps=int(documents_length / batch_size) + 5,
                        epochs=epochs, verbose=1, workers=2,
                        use_multiprocessing=True,
                        callbacks=[reduce_lr, early_stopping])
    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
def run():
    batch_size = 63
    epochs = 5000
    data_process = DataProcess(use_word2cut=False)
    model = build_model()
    documents_length = data_process.get_documents_size(data_process.enc_ids_file,
                                                       data_process.dec_ids_file)
    if batch_size > documents_length:
        print("ERROR--->" + "Too little corpus data, please add more")
        return None
    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=int(documents_length / batch_size),
                        epochs=epochs, verbose=1, workers=1)
    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
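# --- illustration, not from the repo ---------------------------------------
# Both variants of run() hand fit_generator a `generate_batch` generator.
# Keras only requires that it yield (inputs, targets) tuples forever; below
# is a minimal self-contained sketch of that contract, with random data
# standing in for the real corpus reader and every shape parameter assumed:
import numpy as np

def generate_batch_demo(batch_size, input_length=20, enc_dim=128,
                        output_length=20, dec_dim=128):
    while True:
        x = np.random.rand(batch_size, input_length, enc_dim)
        y = np.random.rand(batch_size, output_length, dec_dim)
        yield x, y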
import gensim
from seq2seq.models import AttentionSeq2Seq  # farizrahman4u/seq2seq library

def run():
    enc_vec_model = gensim.models.Word2Vec.load(r'model/encoder_vector.m')
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    batch_size = 9
    epochs = 30
    data_process = DataProcess(use_word2cut=False)
    documents_length = data_process.get_documents_size(data_process.enc_ids_file,
                                                       data_process.dec_ids_file)
    input_length = data_process.enc_input_length
    output_length = data_process.dec_output_length
    enc_embedding_length = data_process.enc_embedding_length
    dec_embedding_length = data_process.dec_embedding_length
    if batch_size > documents_length:
        print("ERROR--->" + "Too little corpus data, please add more")
        return None
    if data_process.hidden_dim < data_process.enc_input_length:
        print("ERROR--->" + "Too few hidden units, please add more")
        return None
    model = AttentionSeq2Seq(output_dim=dec_embedding_length,
                             hidden_dim=data_process.hidden_dim,
                             output_length=output_length,
                             input_shape=(input_length, enc_embedding_length),
                             batch_size=batch_size,
                             depth=data_process.layer_shape)
    # keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
    model.compile(loss='mse', optimizer='rmsprop')
    model.fit_generator(generator=generate_batch(batch_size=batch_size,
                                                 encoder_word2vec_model=enc_vec_model,
                                                 decoder_word2vec_model=dec_vec_model,
                                                 encoder_file_path=data_process.enc_ids_padding_file,
                                                 decoder_file_path=data_process.dec_ids_padding_file,
                                                 embedding_shape=(enc_embedding_length, dec_embedding_length)),
                        steps_per_epoch=int(documents_length / batch_size),
                        epochs=epochs, verbose=1, workers=1)
    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
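# --- illustration, not from the repo ---------------------------------------
# AttentionSeq2Seq here is fed dense word2vec vectors rather than token ids,
# so each sentence must become an (input_length, enc_embedding_length)
# matrix. A sketch of that conversion for one tokenized sentence, zero-padding
# short inputs (function and parameter names are assumptions):
import numpy as np

def sentence_to_matrix(words, w2v_model, input_length):
    matrix = np.zeros((input_length, w2v_model.vector_size))
    for i, word in enumerate(words[:input_length]):
        if word in w2v_model.wv:
            matrix[i] = w2v_model.wv[word]
    return matrix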
import os
import pickle
import gensim

# batch_size, epochs and corpus_path are module-level settings defined
# elsewhere in the repo.
def run():
    if not os.path.exists("data"):
        os.makedirs("data")
    if not os.path.exists("model"):
        os.makedirs("model")
    print("step-1--->" + "Load the word-vector model" + "--->START")
    embedding_model = gensim.models.Word2Vec.load(r'model/model_vector_people.m')
    word_dict = create_useful_words(embedding_model)
    embedding_size = embedding_model.vector_size
    print("step-2--->" + "Convert corpus format and write labeled standard files" + "--->START")
    raw_train_file = [corpus_path + os.sep + main_path + os.sep + sub_path
                      for main_path in os.listdir(corpus_path)
                      for sub_path in os.listdir(corpus_path + os.sep + main_path)]
    create_label_data(word_dict, raw_train_file)
    print("step-3--->" + "Split and store documents on punctuation or whitespace" + "--->START")
    documents_length = create_documents()
    print("step-4--->" + "Count and sort corpus words to build an index" + "--->START")
    lexicon, lexicon_reverse = create_lexicon(word_dict)
    print("step-5--->" + "Create word vectors for all words" + "--->START")
    useful_word_length, embedding_weights = create_embedding(embedding_model,
                                                             embedding_size,
                                                             lexicon_reverse)
    print("step-6--->" + "Generate labels and their indices" + "--->START")
    label_2_index = create_label_index()
    label_2_index_length = len(label_2_index)
    print("step-7--->" + "Index-encode every sentence and label in the corpus" + "--->START")
    create_matrix(lexicon, label_2_index)
    print("step-8--->" + "Pad every sentence and label to the maximum length with zeros" + "--->START")
    max_len = maxlen_2d_list()
    padding_sentences(max_len)
    print("step-9--->" + "Build the model" + "--->START")
    model = bilstm_cnn_crf(max_len, useful_word_length + 2, label_2_index_length,
                           embedding_size, embedding_weights)
    print("step-10--->" + "Train the model" + "--->START")
    if batch_size > documents_length:
        print("ERROR--->" + "Too little corpus data, please add more")
        return None
    _ = model.fit_generator(generator=generate_batch(batch_size=batch_size,
                                                     label_class=label_2_index_length),
                            steps_per_epoch=int(documents_length / batch_size),
                            epochs=epochs, verbose=1, workers=1)
    print("step-11--->" + "Save the model and dictionaries" + "--->START")
    model.save_weights('model/train_model.hdf5')
    index_2_label = create_index_label()
    pickle.dump([lexicon, index_2_label], open('model/lexicon.pkl', 'wb'))
    pickle.dump([max_len, embedding_size, useful_word_length + 2, label_2_index_length],
                open('model/model_params.pkl', 'wb'))
    print("step-12--->" + "Print the key parameters needed to restore the model" + "--->START")
    print("sequence_max_length: " + str(max_len))
    print("embedding size: " + str(embedding_size))
    print("useful_word_length: " + str(useful_word_length + 2))
    print("label_2_index_length: " + str(label_2_index_length))
    print("Training complete" + "--->OK")
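# --- illustration, not from the repo ---------------------------------------
# The two pickles written in step-11 carry everything needed to rebuild the
# network for inference; the load order mirrors the dump order above. Note
# that the stored word count already includes the +2 offset, and that
# embedding_weights is not pickled, so rebuilding the model still needs it:
import pickle

with open('model/lexicon.pkl', 'rb') as f:
    lexicon, index_2_label = pickle.load(f)
with open('model/model_params.pkl', 'rb') as f:
    max_len, embedding_size, word_count, label_count = pickle.load(f)
# model = bilstm_cnn_crf(max_len, word_count, label_count,
#                        embedding_size, embedding_weights)
# model.load_weights('model/train_model.hdf5')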
    tensorBoard = TensorBoard(
        # constructor head reconstructed; log_dir and any earlier arguments
        # are elided in the original excerpt
        histogram_freq=0,
        batch_size=args.batch_size,
        write_graph=True,
        write_grads=False,
        write_images=False,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None,
        embeddings_data=None,
        update_freq="epoch",
    )
    _ = model.fit_generator(
        generator=generate_batch(
            trainPath=trainPath,
            batch_size=args.batch_size,
            label_class=args.label_2_index_length,
        ),
        steps_per_epoch=int(args.documents_length / args.batch_size),
        epochs=args.epochs,
        verbose=1,
        workers=1,
        callbacks=[checkpoint, tensorBoard, earlyStopping],
    )
    logger.info("step-11--->" + "Save the model and dictionaries" + "--->START")
    model.save_weights(trainPath.weights_path)
    index_2_label = dataPreprocess.create_index_label()
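# --- illustration, not from the repo ---------------------------------------
# `checkpoint` and `earlyStopping` are built before this excerpt begins; the
# standard Keras callback constructors look roughly like this (file path and
# patience value are assumptions, and `loss` is monitored here because the
# fit_generator call above passes no validation data):
from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpoint = ModelCheckpoint(filepath="model/checkpoint-{epoch:02d}.hdf5",
                             monitor="loss", save_best_only=True)
earlyStopping = EarlyStopping(monitor="loss", patience=10, verbose=1)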