def do_rnn(x, y):
    global max_document_length
    print("RNN")
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test = testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              validation_set=0.1, show_metric=True,
              batch_size=10, run_id="webshell", n_epoch=5)

    y_predict_list = model.predict(testX)
    y_predict = []
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)
def do_rnn(trainX, testX, trainY, testY):
    max_document_length = 64
    y_test = testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir="dga_log")
    model.fit(trainX, trainY,
              validation_set=(testX, testY), show_metric=True,
              batch_size=10, run_id="dga", n_epoch=1)

    y_predict_list = model.predict(testX)
    #print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))
def do_cnn(trainX, trainY, testX, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words + 1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=20, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=32)
def pad_sentences_qr(query, response, q_max_len, r_max_len, index):
    train_query = pad_sequences(query, maxlen=q_max_len, value=index)
    train_response = pad_sequences(response, maxlen=r_max_len, value=index)
    train_query = np.array(train_query)
    train_response = np.array(train_response)
    train_query_response = np.append(train_query, train_response, axis=1)
    return train_query, train_query_response, train_response, q_max_len, r_max_len, index + 1
def do_rnn(trainX, testX, trainY, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    print("GET n_words embedding %d" % n_words)
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, MAX_DOCUMENT_LENGTH])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY,
              validation_set=(testX, testY), show_metric=True,
              batch_size=32, run_id="maidou")
def pad_SentencesQR(query, response):
    q_max_len, r_max_len, query_word, response_word, index = fenci(query, response)
    print("query max length:{}, response max length:{}".format(q_max_len, r_max_len))
    train_query = pad_sequences(query_word, maxlen=q_max_len, value=index)
    train_response = pad_sequences(response_word, maxlen=r_max_len, value=index)
    # print train_query[0]
    # print train_response[0]
    train_query = np.array(train_query)
    train_response = np.array(train_response)
    train_query_response = np.append(train_query, train_response, axis=1)
    return train_query, train_query_response, train_response, q_max_len, r_max_len, index
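Both helpers above concatenate the padded query and response matrices column-wise with np.append(axis=1). A small self-contained sketch of that shape arithmetic, using made-up id sequences and lengths:

import numpy as np
from tflearn.data_utils import pad_sequences

# Two toy queries and responses already mapped to integer ids (values are illustrative).
query_word = [[3, 7], [5]]
response_word = [[2, 9, 4], [8]]
q_max_len, r_max_len, pad_index = 4, 6, 0

train_query = np.array(pad_sequences(query_word, maxlen=q_max_len, value=pad_index))
train_response = np.array(pad_sequences(response_word, maxlen=r_max_len, value=pad_index))
train_query_response = np.append(train_query, train_response, axis=1)

print(train_query.shape, train_response.shape, train_query_response.shape)
# (2, 4) (2, 6) (2, 10): each row is a padded query followed by its padded response.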
def do_cnn(x, y):
    global max_document_length
    print("CNN and tf")
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test = testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    #if not os.path.exists(pkl_file):
    # Training
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=0.1,
              show_metric=True, batch_size=100, run_id="webshell")
    #    model.save(pkl_file)
    #else:
    #    model.load(pkl_file)

    y_predict_list = model.predict(testX)
    #y_predict = list(model.predict(testX, as_iterable=True))

    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print('y_predict_list:')
    print(y_predict_list)
    print('y_predict:')
    print(y_predict)
    #print(y_test)

    do_metrics(y_test, y_predict)
def do_rnn(trainX, testX, trainY, testY):
    global max_sequences_len
    global max_sys_call
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY_old = testY
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    print("GET max_sequences_len embedding %d" % max_sequences_len)
    print("GET max_sys_call embedding %d" % max_sys_call)

    net = tflearn.input_data([None, max_sequences_len])
    net = tflearn.embedding(net, input_dim=max_sys_call + 1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY,
              validation_set=(testX, testY), show_metric=True,
              batch_size=32, run_id="maidou")

    y_predict_list = model.predict(testX)
    #print(y_predict_list)

    y_predict = []
    for i in y_predict_list:
        #print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    #y_predict = to_categorical(y_predict, nb_classes=2)

    print(classification_report(testY_old, y_predict))
    print(metrics.confusion_matrix(testY_old, y_predict))
def bi_lstm(trainX, trainY, testX, testY):
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 200])
    net = tflearn.embedding(net, input_dim=20000, output_dim=128)
    net = tflearn.bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
    model.fit(trainX, trainY,
              validation_set=0.1, show_metric=True,
              batch_size=64, run_id="rnn-bilstm")
def lstm(trainX, trainY, testX, testY):
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              validation_set=(testX, testY), show_metric=True,
              batch_size=32, run_id="rnn-lstm")
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="transformer_classification")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("transformer_classification.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="transformer_classification")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    print("list of total questions:", len(questionid_question_lists))
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    print("list of total questions2:", len(test))
    testX = []
    question_id_list = []
    for tuple in test:
        question_id, question_string_list = tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("list of total questions3:", len(testX2))
    print("end padding...")

    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                            FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k, FLAGS.d_v,
                            FLAGS.h, FLAGS.num_layer, FLAGS.is_training, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            logits = sess.run(model.logits,
                              feed_dict={model.input_x: testX2[start:end],
                                         model.dropout_keep_prob: 1})  # logits:[batch_size,self.num_classes]
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits,
                                         vocabulary_index2word_label, predict_target_file_f)
            # 6. get lable using logtis
            #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label)
            #print(index," ;predicted_labels:",predicted_labels)
            # 7. write question id and labels to file system.
            #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=file_path,
                                    n_words=vocab_size,
                                    valid_portion=val_fraction,
                                    sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)

    return train_dataset
def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print("RNN and wordbag")

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              validation_set=(testX, testY), show_metric=True,
              batch_size=10, run_id="review", n_epoch=5)
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="dynamic_memory_network")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="dynamic_memory_network")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for tuple in test:
        question_id, question_string_list = tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                                     FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length, vocab_size, FLAGS.embed_size,
                                     FLAGS.hidden_size, FLAGS.is_training,
                                     num_pass=FLAGS.num_pass,
                                     use_gated_gru=FLAGS.use_gated_gru,
                                     decode_with_sequences=FLAGS.decode_with_sequences,
                                     multi_label_flag=FLAGS.multi_label_flag,
                                     l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint of EntityNet.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            logits = sess.run(model.logits,
                              feed_dict={model.query: testX2[start:end],
                                         model.story: np.expand_dims(testX2[start:end], axis=1),
                                         model.dropout_keep_prob: 1.0})  # 'shape of logits:', (1, 1999)
            # 6. get lable using logtis
            #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f)
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits,
                                         vocabulary_index2word_label, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    #iii=0
    #iii/0
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)  # TODO
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)  # TODO
    testX = []
    question_id_list = []
    for tuple in test:
        question_id, question_string_list = tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size,
                             FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),
                              range(batch_size, number_of_training_data + 1, batch_size)):
            logits = sess.run(fast_text.logits,
                              feed_dict={fast_text.sentence: testX2[start:end]})  # 'shape of logits:', (1, 1999)
            # 6. get lable using logtis
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="seq2seq_attention")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("seq2seq_attention.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="seq2seq_attention", use_seq2seq=True)
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for tuple in test:
        question_id, question_string_list = tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = seq2seq_attention_model(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                                        FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                        vocab_size, FLAGS.embed_size, FLAGS.hidden_size,
                                        FLAGS.is_training,
                                        decoder_sent_length=FLAGS.decoder_sent_length,
                                        l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        decoder_input = np.reshape(
            np.array([vocabulary_word2index_label[_GO]] +
                     [vocabulary_word2index_label[_PAD]] * (FLAGS.decoder_sent_length - 1)),
            [-1, FLAGS.decoder_sent_length])
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            predictions, logits = sess.run(
                [model.predictions, model.logits],
                feed_dict={model.input_x: testX2[start:end],
                           model.decoder_input: decoder_input,
                           model.dropout_keep_prob: 1})  # 'shape of logits:', (1, 1999)
            # 6. get lable using logtis
            predicted_labels = get_label_using_logits(logits[0], predictions,
                                                      vocabulary_index2word_label,
                                                      vocabulary_word2index_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def create_training_data(self):
    X = []
    y = []
    for k, v in self._sentences.items():
        for sentence in v:
            word_ids = np.zeros(self._max_document_size, np.int64)
            for idx, token in enumerate(sentence):
                if idx >= self._max_document_size:
                    break
                word_id = self._word_index.get(token)
                if word_id is None:
                    word_ids[idx] = 0
                else:
                    word_ids[idx] = word_id
            X.append(word_ids)
            labels = self._labels_to_nums(k)
            y.append(labels)
    X = pad_sequences(X, maxlen=self._max_document_size, value=0.)
    y = [np.array(label) for label in y]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self._test_split, random_state=42)
    return X_train, X_test, y_train, y_test
def test():
    v2i, _ = build_vocab()
    _, i2l = build_label()
    origin_questions = ['今天 天气 不错', '介绍 贵金属 产品']
    questions = [q.split() for q in origin_questions]
    questions = [[v2i[vocab] for vocab in ques if vocab in v2i] for ques in questions]
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(checkpoint_path + model_name)
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))
        model = tf.get_default_graph()
        x = model.get_tensor_by_name("x:0")
        predict = model.get_tensor_by_name("predictions:0")
        questions = pad_sequences(questions, maxlen=x.shape[1], value=0)
        feed_dict = {x: questions}
        p = sess.run([predict], feed_dict=feed_dict)
        p = p[0].tolist()
        for index in range(len(questions)):
            print(f'{origin_questions[index]} is_business: {i2l[p[index]]}')
def predictThis(model, sentence):
    ignore_words = ['?']
    pattern_words = nltk.word_tokenize(sentence)
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words
                     if word not in ignore_words]
    encoded_sentence = []
    for w in pattern_words:
        if w in words:
            #print(w, ' : ', words.index(w))
            encoded_sentence.append(words.index(w))
    #print(encoded_sentence)
    samples = []
    samples.append(encoded_sentence)
    samples = pad_sequences(samples, maxlen=netprops.x_width, value=0.)
    preds = model.predict(samples)
    index, value = max(enumerate(preds[0]), key=operator.itemgetter(1))
    print(sentence, ' : ', classes[index], ' : ', (value * 100), '%')
def get_data():
    black_x, cdn_x, white_x = get_local_data()
    black_y, cdn_y, white_y = [LABEL.black] * len(black_x), [LABEL.cdn] * len(cdn_x), [LABEL.white] * len(white_x)

    X = black_x + cdn_x + white_x
    labels = black_y + cdn_y + white_y

    # Generate a dictionary of valid characters
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    print("max_features:", max_features)
    maxlen = np.max([len(x) for x in X])
    print("max_len:", maxlen)
    maxlen = min(maxlen, 256)

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = pad_sequences(X, maxlen=maxlen, value=0.)

    # Convert labels to 0-1
    Y = to_categorical(labels, nb_classes=3)

    volcab_file = "volcab.pkl"
    output = open(volcab_file, 'wb')
    # Pickle dictionary using protocol 0.
    data = {"valid_chars": valid_chars, "max_len": maxlen, "volcab_size": max_features}
    pickle.dump(data, output)
    output.close()

    return X, Y, maxlen, max_features
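The character dictionary and padding length that get_data pickles to volcab.pkl have to be reapplied to new domains at prediction time. A minimal sketch of that step, assuming a model trained on the matrices returned above; the encode_domains helper and the example domain strings are illustrative, not part of the original code:

import pickle

from tflearn.data_utils import pad_sequences

def encode_domains(domains, volcab_file="volcab.pkl"):
    """Map raw domain strings to padded integer sequences using the saved vocabulary."""
    with open(volcab_file, "rb") as f:
        data = pickle.load(f)
    valid_chars, maxlen = data["valid_chars"], data["max_len"]
    # Unknown characters fall back to 0, the same value used for padding.
    seqs = [[valid_chars.get(ch, 0) for ch in domain] for domain in domains]
    return pad_sequences(seqs, maxlen=maxlen, value=0.)

# Usage (the trained model object is assumed to be loaded elsewhere):
# X_new = encode_domains(["example.com", "x93kd0a.biz"])
# predictions = model.predict(X_new)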
def train(trainX, trainY, model_file):
    # Data preprocessing
    trainX = pad_sequences(trainX, maxlen=charvec_len, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)

    net = bi_LSTM()

    # Training
    '''
    tensorboard_verbose:
        0: Loss, Accuracy (Best Speed)
        1: Loss, Accuracy + Gradients
        2: Loss, Accuracy, Gradients, Weights
        3: Loss, Accuracy, Gradients, Weights, Activations, Sparsity (Best Visualization)
    '''
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0,
                        checkpoint_path='./chkpoint/',
                        best_checkpoint_path='./best_chkpoint/',
                        best_val_accuracy=0.9)

    # show_metric: If True, accuracy will be calculated and displayed
    #              at every step. Might give slower training.
    model.fit(trainX, trainY, validation_set=0.1, show_metric=False,
              batch_size=128, n_epoch=1,
              run_id='bLSTM_i{}_{}k_d{}_o{}_d{}_adam_l{}_b{}'.format(
                  charvec_len, in_dim // 1000, int(drop1 * 10), nn_dim,
                  int(drop2 * 10), str(lrate).split('.')[1], nn_dim))

    # Save model
    model.save(model_file)
def Example_P(article, entity, vocab, hps):
    # get ids of special tokens
    pad_id = vocab.word2id(PAD_TOKEN)

    """process the article"""
    # create vocab and word 2 id
    article_value = value2ids(article, vocab, hps.document_length)
    # word 2 id
    article_words = article2ids(article, vocab)
    # num sentence
    article_len = len(article)
    # word level padding
    article_words = pad_sequences(article_words, maxlen=hps.sequence_length, value=pad_id)
    # sentence level padding
    pad_article = np.expand_dims(np.zeros(hps.sequence_length, dtype=np.int32), axis=0)
    if article_words.shape[0] > hps.max_num_sequence:
        article_words = article_words[:hps.max_num_sequence]
    while article_words.shape[0] < hps.max_num_sequence:
        article_words = np.concatenate((article_words, pad_article))

    return article_value, article_words, article_len
def train(trainX, trainY, model_file):
    print('# Data preprocessing')
    trainX = pad_sequences(trainX, maxlen=440, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)

    print('build network')
    net = bi_LSTM()

    print('# Training')
    '''
    tensorboard_verbose:
        0: Loss, Accuracy (Best Speed)
        1: Loss, Accuracy + Gradients
        2: Loss, Accuracy, Gradients, Weights
        3: Loss, Accuracy, Gradients, Weights, Activations, Sparsity (Best Visualization)
    '''
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0,
                        checkpoint_path='./chkpoint_mdm001/',
                        best_checkpoint_path='./best_chkpoint_mdm001/',
                        best_val_accuracy=0.9)
    print('tfl.DNN end.')

    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=128, n_epoch=4, run_id='bilstm_170519b')
    print('model.fit end.')

    # Save model
    model.save(model_file)
    print('model save end.')
def load_data_multilabel(traning_data_path, vocab_word2index, vocab_label2index,
                         sentence_len, training_portion=0.95):
    """
    convert data as indexes using word2index dicts.
    :param traning_data_path:
    :param vocab_word2index:
    :param vocab_label2index:
    :return:
    """
    file_object = codecs.open(traning_data_path, mode='r', encoding='utf-8')
    lines = file_object.readlines()
    random.shuffle(lines)
    label_size = len(vocab_label2index)
    X = []
    Y = []
    for i, line in enumerate(lines):
        raw_list = line.strip().split("__label__")
        input_list = raw_list[0].strip().split(" ")
        x = [vocab_word2index.get(x, UNK_ID) for x in input_list if x != '']
        label_list = raw_list[1:]
        label_list = [vocab_label2index[label] for label in label_list]
        y = transform_multilabel_as_multihot(label_list, label_size)
        X.append(x)
        Y.append(y)

    X = pad_sequences(X, maxlen=sentence_len, value=0.)  # padding to max length
    number_examples = len(lines)
    training_number = int(training_portion * number_examples)
    train = (X[0:training_number], Y[0:training_number])
    valid_number = min(1000, number_examples - training_number)
    test = (X[training_number + 1:training_number + valid_number + 1],
            Y[training_number + 1:training_number + valid_number + 1])
    return train, test
def machine_learning(comments):
    # Split into samples and labels
    X = comments["content"]
    y = comments["quality"]

    # Split into training and test sets
    random_state = 42
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state)

    # Convert the sample strings into numeric sequences: build the vocab and turn X into X_word_ids
    vect = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b\w{1,}\b')
    vect.fit(X_train)
    vocab = vect.vocabulary_

    def convert_X_to_X_word_ids(X):
        return X.apply(lambda x: [
            vocab[w] for w in [w.lower().strip() for w in x.split()] if w in vocab
        ])

    X_train_word_ids = convert_X_to_X_word_ids(X_train)
    X_test_word_ids = convert_X_to_X_word_ids(X_test)

    # Sequence padding
    X_test_padded_seqs = pad_sequences(X_test_word_ids, maxlen=20, value=0)
    X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=20, value=0)

    # Label processing
    unique_y_labels = list(y_train.value_counts().index)
    le = preprocessing.LabelEncoder()
    le.fit(unique_y_labels)
    y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]),
                             nb_classes=len(unique_y_labels))
    y_test = to_categorical(y_test.map(lambda x: le.transform([x])[0]),
                            nb_classes=len(unique_y_labels))

    # Build the network
    n_epoch = 10
    size_of_each_vector = X_train_padded_seqs.shape[1]
    vocab_size = len(vocab)
    no_of_unique_y_labels = len(unique_y_labels)

    net = tflearn.input_data([None, size_of_each_vector])
    net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.6)
    net = tflearn.fully_connected(net, no_of_unique_y_labels, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=1e-4,
                             loss='categorical_crossentropy')

    # Train the network
    # Initialize
    model = tflearn.DNN(net, tensorboard_verbose=0,
                        tensorboard_dir="./tflearn_data/tflearn_logs/")
    # Train
    model.fit(X_train_padded_seqs, y_train,
              validation_set=(X_test_padded_seqs, y_test),
              n_epoch=n_epoch, show_metric=True, batch_size=100)
    # Save
    time = datetime.now()
    time_str = str(time).replace(":", ".")
    os.makedirs(
        f"./tflearn_data/tflearn_models/{time_str}({n_epoch}, {random_state})")
    model.save(
        f"./tflearn_data/tflearn_models/{time_str}({n_epoch}, {random_state})/model")
def comment_predict(data):
    """
    Predict comment sentiment with the saved model.
    :return:
    """
    # Comment data used when the model was built
    predict_data = pd.read_csv("/var/www/test/python/comments_tag.csv")

    def chinese_word_cut(text):
        """
        Segment Chinese text into separate words with jieba.
        :param text: the full comment
        :return: the segmented comment
        """
        return " ".join(jieba.cut(text))

    # Segment the comments and store the result in a new column
    predict_data["cut_comment"] = predict_data.comment.apply(chinese_word_cut)

    # Select the comment part (X) and the label part (y)
    X = predict_data["cut_comment"]
    y = predict_data["evaluation"]

    # Split the dataset into a training set (train) and a test set (test).
    # The random seed must match the one used when the model was built.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    def get_custom_stopwords(stop_words_file):
        """
        Load the stop-word list.
        :param stop_words_file:
        :return: list of stop words
        """
        with open(stop_words_file, encoding="utf-8") as f:
            stopwords = f.read()
        stopwords_list = stopwords.split("\n")
        custom_stopwords_list = [i for i in stopwords_list]
        return custom_stopwords_list

    # Load the stop-word list
    stop_words_file = "/var/www/test/python/哈工大停用词表.txt"
    stopwords = get_custom_stopwords(stop_words_file)

    # Compute feature values
    vect = CountVectorizer(max_df=0.8, min_df=3,
                           token_pattern=u'(?u)\\b\\w+\\b',
                           stop_words=frozenset(stopwords))
    vect.fit(X_train)
    vocab = vect.vocabulary_

    def convert_X_to_X_word_ids(X):
        """
        Convert comments (text) into id sequences (numeric).
        :param X: the comments
        :return: numeric sequences
        """
        return X.apply(lambda x: [
            vocab[w] for w in [w.lower().strip() for w in x.split()] if w in vocab
        ])

    # Pad every comment sequence to length 20 so they share the same shape, filling with 0
    X_train_word_ids = convert_X_to_X_word_ids(X_train)
    X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=20, value=0)

    # Label processing
    unique_y_labels = list(y_train.value_counts().index)
    le = preprocessing.LabelEncoder()
    le.fit(unique_y_labels)

    # Build the network
    size_of_each_vector = X_train_padded_seqs.shape[1]
    vocab_size = len(vocab)
    no_of_unique_y_labels = len(unique_y_labels)

    net = tflearn.input_data([None, size_of_each_vector])
    net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.6)
    net = tflearn.fully_connected(net, no_of_unique_y_labels, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=1e-4,
                             loss='categorical_crossentropy')

    # Initialize
    model = tflearn.DNN(net, tensorboard_verbose=0)
    # Load the saved model
    model.load("/var/www/test/python/2019-07-10 20.03.06.175272(1000, 42)/model")

    # --------------------------------- prediction ---------------------------------
    # Comments to predict
    predict_data = data
    # Segment the comments
    predict_data["cut_comment"] = predict_data.comment.apply(chinese_word_cut)
    # Build the prediction set
    predict_X = predict_data["cut_comment"]
    # Convert to numeric sequences
    predict_X_word_ids = convert_X_to_X_word_ids(predict_X)
    predict_X_padded_seqs = pad_sequences(predict_X_word_ids, maxlen=20, value=0)
    # Predict and collect the results
    predict_Y = model.predict(predict_X_padded_seqs)
    # Output the results
    get_evaluation(predict_Y)
def main(_):
    # 1.load data (X: list of int, y: int).
    #if os.path.exists(FLAGS.cache_path):  # if the cache exists on disk, load the (vocabulary-indexed) data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #        vocab_size = len(vocabulary_index2word)
    #else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label, _ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label, data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500 lists; each list represents one sentence
        print("testY.shape:", np.array(testY).shape)  # 2500 labels
        print("testX[0]:", testX[0])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1])
        print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0

        # 2.Data preprocessing
        # Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        ###############################################################################################
        #with open(FLAGS.cache_path, 'w') as data_f:  # save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        ###############################################################################################
        print("testX[0]:", testX[0])
        print("testX[1]:", testX[1])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        # Converting labels to binary vectors
        print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0
        print("end padding & transform to one hot...")

    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size,
                             FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text)
        curr_epoch = sess.run(fast_text.epoch_step)

        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):  # range(start, stop, step_size)
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [fast_text.loss_val, fast_text.accuracy, fast_text.train_op],
                    feed_dict={fast_text.sentence: trainX[start:end],
                               fast_text.labels: trainY[start:end]})
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))

            # epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=fast_text.epoch_step)  # fast_text.epoch_step

        # 5.finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY, batch_size)
    pass
def test_pad():
    trainX = 'w18476 w4454 w1674 w6 w25 w474 w1333 w1467 w863 w6 w4430 w11 w813 w4463 w863 w6 w4430 w111'
    trainX = trainX.split(" ")
    trainX = pad_sequences([[trainX]], maxlen=100, value=0.)
    print("trainX:", trainX)
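As written, test_pad wraps the token list in an extra pair of brackets and feeds raw word strings rather than integer ids, so the padded result is not a usable feature matrix. A minimal sketch of the more typical call, where the tokens are first mapped to integer ids through a small illustrative vocabulary (the word2id mapping here is made up for the example):

from tflearn.data_utils import pad_sequences

def test_pad_ids():
    tokens = 'w18476 w4454 w1674 w6 w25 w474 w1333 w1467 w863 w6 w4430 w11'.split(" ")
    # Illustrative vocabulary: map each distinct token to a non-zero integer id.
    word2id = {w: i + 1 for i, w in enumerate(sorted(set(tokens)))}
    seq = [word2id[w] for w in tokens]
    # pad_sequences expects a list of integer sequences; 0. is reserved for padding.
    padded = pad_sequences([seq], maxlen=100, value=0.)
    print("padded shape:", padded.shape)  # (1, 100)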
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = input_data(shape=[None, 200])
net = embedding(net, input_dim=20000, output_dim=128)
net = bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
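The script stops after constructing the DNN wrapper. A minimal sketch of the training call that would typically follow, reusing the padded splits prepared above (the batch size is illustrative):

# Train on the padded IMDB sequences, validating on the held-out test split.
model.fit(trainX, trainY, validation_set=(testX, testY),
          show_metric=True, batch_size=64)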
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        simple='simple',
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="cnn2")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="cnn2")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for tuple in test:
        question_id, question_string_list = tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                          FLAGS.decay_rate, FLAGS.sentence_len, vocab_size,
                          FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(
                range(0, number_of_training_data, FLAGS.batch_size),
                range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            logits = sess.run(textCNN.logits,
                              feed_dict={textCNN.input_x: testX2[start:end],
                                         textCNN.dropout_keep_prob: 1})  # 'shape of logits:', (1, 1999)
            # 6. get lable using logtis
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def main(_):
    #os.environ['CUDA_VISIBLE_DEVICES'] = ''
    if FLAGS.dataset == "bibsonomy-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_bib
        traning_data_path = FLAGS.training_data_path_bib
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.59
    elif FLAGS.dataset == "zhihu-sample":
        word2vec_model_path = FLAGS.word2vec_model_path_zhihu
        traning_data_path = FLAGS.training_data_path_zhihu
        FLAGS.sequence_length = 100
        FLAGS.ave_labels_per_doc = 2.45
    elif FLAGS.dataset == "citeulike-a-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cua
        traning_data_path = FLAGS.training_data_path_cua
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.6
    elif FLAGS.dataset == "citeulike-t-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cut
        traning_data_path = FLAGS.training_data_path_cut
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 7.68

    # 1. create trainlist, validlist and testlist
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path, name_scope=FLAGS.dataset + "-lda")  # simple='simple'
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        voabulary_label=traning_data_path, name_scope=FLAGS.dataset + "-lda")
    num_classes = len(vocabulary_word2index_label)
    print(vocabulary_index2word_label[0], vocabulary_index2word_label[1])
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)

    # choosing whether to use k-fold cross-validation or hold-out validation
    if FLAGS.kfold == -1:  # hold-out
        train, valid, test = load_data_multilabel_new(
            vocabulary_word2index, vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            valid_portion=FLAGS.valid_portion,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here train, test are tuples; turn train into trainlist.
        trainlist, validlist, testlist = list(), list(), list()
        trainlist.append(train)
        validlist.append(valid)
        testlist.append(test)
    else:  # k-fold
        trainlist, validlist, testlist = load_data_multilabel_new_k_fold(
            vocabulary_word2index, vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            kfold=FLAGS.kfold,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here trainlist, testlist are list of tuples.

    # get and pad testing data: there is only one testing data, but kfold training and validation data
    assert len(testlist) == 1
    testX, testY = testlist[0]
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length

    # 3. transform trainlist to the format. x_train, x_test: training and test feature matrices of size (n_samples, n_features)
    #print(len(trainlist))
    #trainX, trainY = trainlist[0]
    #trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
    #print(len(trainX))
    #print(len(trainX[0]))
    #print(trainX[0])
    #print(len(trainY))
    #print(len(trainY[0]))
    #print(trainY[0])
    #print(np.asarray(trainY).shape)

    num_runs = len(trainlist)
    # validation results variables
    valid_acc_th, valid_prec_th, valid_rec_th, valid_fmeasure_th, valid_hamming_loss_th = \
        [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs  # initialise the result lists
    final_valid_acc_th, final_valid_prec_th, final_valid_rec_th, final_valid_fmeasure_th, final_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_valid_acc_th, min_valid_prec_th, min_valid_rec_th, min_valid_fmeasure_th, min_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_valid_acc_th, max_valid_prec_th, max_valid_rec_th, max_valid_fmeasure_th, max_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_valid_acc_th, std_valid_prec_th, std_valid_rec_th, std_valid_fmeasure_th, std_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    # testing results variables
    test_acc_th, test_prec_th, test_rec_th, test_fmeasure_th, test_hamming_loss_th = \
        [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs  # initialise the testing result lists
    final_test_acc_th, final_test_prec_th, final_test_rec_th, final_test_fmeasure_th, final_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_test_acc_th, min_test_prec_th, min_test_rec_th, min_test_fmeasure_th, min_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_test_acc_th, max_test_prec_th, max_test_rec_th, max_test_fmeasure_th, max_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_test_acc_th, std_test_prec_th, std_test_rec_th, std_test_fmeasure_th, std_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0

    # output variables
    output_valid = ""
    output_test = ""
    output_csv_valid = "fold,hamming_loss,acc,prec,rec,f1"
    output_csv_test = "fold,hamming_loss,acc,prec,rec,f1"

    time_train = [0] * num_runs  # get time spent in training
    num_run = 0

    mallet_path = FLAGS.mallet_path
    num_topics = FLAGS.num_topics
    alpha = 50 / num_topics
    iterations = FLAGS.iterations
    k_num_doc = FLAGS.k_num_doc

    remove_pad_id = True
    remove_dot = True
    docs_test = generateLDAdocFromIndex(testX, vocabulary_index2word,
                                        remove_pad_id=remove_pad_id, remove_dot=remove_dot)

    for trainfold in trainlist:
        # get training and validation data
        trainX, trainY = trainfold
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)

        # generate training data for gensim MALLET wrapper for LDA
        docs = generateLDAdocFromIndex(trainX, vocabulary_index2word,
                                       remove_pad_id=remove_pad_id, remove_dot=remove_dot)
        #print(docs[10])
        id2word = corpora.Dictionary(docs)
        corpus = [id2word.doc2bow(text) for text in docs]
        #print(corpus[10])

        # generate validation data for gensim MALLET wrapper for LDA
        validX, validY = validlist[num_run]
        validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.)
        docs_valid = generateLDAdocFromIndex(validX, vocabulary_index2word,
                                             remove_pad_id=remove_pad_id, remove_dot=remove_dot)
        corpus_valid = [id2word.doc2bow(text) for text in docs_valid]

        # generate testing data for gensim MALLET wrapper for LDA
        corpus_test = [id2word.doc2bow(text) for text in docs_test]

        # training
        start_time_train = time.time()
        print('start training fold', str(num_run))
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus,
                                                 num_topics=num_topics, alpha=alpha,
                                                 id2word=id2word, iterations=iterations)
        pprint(model.show_topics(formatted=False))
        print('num_run', str(num_run), 'train done.')

        time_train[num_run] = time.time() - start_time_train
        print("--- training of fold %s took %s seconds ---" % (num_run, time_train[num_run]))

        # represent each document as a topic vector
        #mat_train = np.array(model[corpus])  # this will cause an Error with large num_topics, e.g. 1000 or higher.
        # Thus, we turn the MALLET LDA model into a native Gensim LDA model
        model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
        mat_train = np.array(model.get_document_topics(corpus, minimum_probability=0.0))
        #print(len(model[corpus[0]]))
        #print(len(model[corpus[1]]))
        #print(len(model[corpus[2]]))
        #print(mat_train.shape)
        mat_train = mat_train[:, :, 1]  # documents in training set as a matrix of topic probabilities

        # evaluate on training data
        #if num_run == 0 and FLAGS.kfold != -1:  # do this only for the first fold in k-fold cross-validation to save time
        #    acc, prec, rec, f_measure, hamming_loss = do_eval_lda(model, k_num_doc, mat_train, trainY, corpus, trainY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        #    print('training:', acc, prec, rec, f_measure, hamming_loss)

        # validation
        valid_acc_th[num_run], valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[num_run], valid_hamming_loss_th[num_run] = do_eval_lda(
            model, k_num_doc, mat_train, trainY, corpus_valid, validY,
            vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        print("LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f"
              % (num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run],
                 valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[num_run]))
        output_valid = output_valid + "\n" + "LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (
            num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run],
            valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[num_run]) + "\n"  # also output the results of each run.
        output_csv_valid = output_csv_valid + "\n" + str(num_run) + "," + \
            str(valid_hamming_loss_th[num_run]) + "," + str(valid_acc_th[num_run]) + "," + \
            str(valid_prec_th[num_run]) + "," + str(valid_rec_th[num_run]) + "," + \
            str(valid_fmeasure_th[num_run])

        start_time_test = time.time()
        # evaluate on testing data
        test_acc_th[num_run], test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[num_run], test_hamming_loss_th[num_run] = do_eval_lda(
            model, k_num_doc, mat_train, trainY, corpus_test, testY,
            vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        print("LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f"
              % (num_run, test_acc_th[num_run], test_hamming_loss_th[num_run],
                 test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[num_run]))
        output_test = output_test + "\n" + "LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (
            num_run, test_acc_th[num_run], test_hamming_loss_th[num_run],
            test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[num_run]) + "\n"  # also output the results of each run.
        output_csv_test = output_csv_test + "\n" + str(num_run) + "," + \
            str(test_hamming_loss_th[num_run]) + "," + str(test_acc_th[num_run]) + "," + \
            str(test_prec_th[num_run]) + "," + str(test_rec_th[num_run]) + "," + \
            str(test_fmeasure_th[num_run])

        print("--- testing of fold %s took %s seconds ---" % (num_run, time.time() - start_time_test))

        prediction_str = ""
        # output final predictions for qualitative analysis
        if FLAGS.report_rand_pred == True:
            prediction_str = display_for_qualitative_evaluation(
                model, k_num_doc, mat_train, trainY, corpus_test, testX, testY,
                vocabulary_index2word, vocabulary_index2word_label,
                hamming_q=FLAGS.ave_labels_per_doc)
        # update the num_run
        num_run = num_run + 1

    print('\n--Final Results--\n')
    #print('C', FLAGS.C, 'gamma', FLAGS.gamma)

    # report min, max, std, average for the validation results
    min_valid_acc_th = min(valid_acc_th)
    min_valid_prec_th = min(valid_prec_th)
    min_valid_rec_th = min(valid_rec_th)
    min_valid_fmeasure_th = min(valid_fmeasure_th)
    min_valid_hamming_loss_th = min(valid_hamming_loss_th)

    max_valid_acc_th = max(valid_acc_th)
    max_valid_prec_th = max(valid_prec_th)
    max_valid_rec_th = max(valid_rec_th)
    max_valid_fmeasure_th = max(valid_fmeasure_th)
    max_valid_hamming_loss_th = max(valid_hamming_loss_th)

    if FLAGS.kfold != -1:
        std_valid_acc_th = statistics.stdev(valid_acc_th)  # to change
        std_valid_prec_th = statistics.stdev(valid_prec_th)
        std_valid_rec_th = statistics.stdev(valid_rec_th)
        std_valid_fmeasure_th = statistics.stdev(valid_fmeasure_th)
        std_valid_hamming_loss_th = statistics.stdev(valid_hamming_loss_th)

    final_valid_acc_th = sum(valid_acc_th) / num_runs
    final_valid_prec_th = sum(valid_prec_th) / num_runs
    final_valid_rec_th = sum(valid_rec_th) / num_runs
    final_valid_fmeasure_th = sum(valid_fmeasure_th) / num_runs
    final_valid_hamming_loss_th = sum(valid_hamming_loss_th) / num_runs

    print("LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)"
          % (final_valid_acc_th, std_valid_acc_th, min_valid_acc_th, max_valid_acc_th,
             final_valid_hamming_loss_th, std_valid_hamming_loss_th, min_valid_hamming_loss_th, max_valid_hamming_loss_th,
             final_valid_prec_th, std_valid_prec_th, min_valid_prec_th, max_valid_prec_th,
             final_valid_rec_th, std_valid_rec_th, min_valid_rec_th, max_valid_rec_th,
             final_valid_fmeasure_th, std_valid_fmeasure_th, min_valid_fmeasure_th, max_valid_fmeasure_th))
    # output the result to a file
    output_valid = output_valid + "\n" + "LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (
        final_valid_acc_th, std_valid_acc_th, min_valid_acc_th, max_valid_acc_th,
        final_valid_hamming_loss_th, std_valid_hamming_loss_th, min_valid_hamming_loss_th, max_valid_hamming_loss_th,
        final_valid_prec_th, std_valid_prec_th, min_valid_prec_th, max_valid_prec_th,
        final_valid_rec_th, std_valid_rec_th, min_valid_rec_th, max_valid_rec_th,
        final_valid_fmeasure_th, std_valid_fmeasure_th, min_valid_fmeasure_th, max_valid_fmeasure_th) + "\n"
    output_csv_valid = output_csv_valid + "\n" + "average" + "," + \
        str(round(final_valid_hamming_loss_th, 3)) + "±" + str(round(std_valid_hamming_loss_th, 3)) + "," + \
        str(round(final_valid_acc_th, 3)) + "±" + str(round(std_valid_acc_th, 3)) + "," + \
        str(round(final_valid_prec_th, 3)) + "±" + str(round(std_valid_prec_th, 3)) + "," + \
        str(round(final_valid_rec_th, 3)) + "±" + str(round(std_valid_rec_th, 3)) + "," + \
        str(round(final_valid_fmeasure_th, 3)) + "±" + str(round(std_valid_fmeasure_th, 3))

    # report min, max, std, average for the testing results
    min_test_acc_th = min(test_acc_th)
    min_test_prec_th = min(test_prec_th)
    min_test_rec_th = min(test_rec_th)
    min_test_fmeasure_th = min(test_fmeasure_th)
    min_test_hamming_loss_th = min(test_hamming_loss_th)

    max_test_acc_th = max(test_acc_th)
    max_test_prec_th = max(test_prec_th)
    max_test_rec_th = max(test_rec_th)
    max_test_fmeasure_th = max(test_fmeasure_th)
    max_test_hamming_loss_th = max(test_hamming_loss_th)

    if FLAGS.kfold != -1:
        std_test_acc_th = statistics.stdev(test_acc_th)  # to change
        std_test_prec_th = statistics.stdev(test_prec_th)
        std_test_rec_th = statistics.stdev(test_rec_th)
        std_test_fmeasure_th = statistics.stdev(test_fmeasure_th)
        std_test_hamming_loss_th = statistics.stdev(test_hamming_loss_th)

    final_test_acc_th = sum(test_acc_th) / num_runs
    final_test_prec_th = sum(test_prec_th) / num_runs
    final_test_rec_th = sum(test_rec_th) / num_runs
    final_test_fmeasure_th = sum(test_fmeasure_th) / num_runs
    final_test_hamming_loss_th = sum(test_hamming_loss_th) / num_runs

    print("LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)"
          % (final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th,
             final_test_hamming_loss_th, std_test_hamming_loss_th, min_test_hamming_loss_th, max_test_hamming_loss_th,
             final_test_prec_th, std_test_prec_th, min_test_prec_th, max_test_prec_th,
             final_test_rec_th, std_test_rec_th, min_test_rec_th, max_test_rec_th,
             final_test_fmeasure_th, std_test_fmeasure_th, min_test_fmeasure_th, max_test_fmeasure_th))
    # output the result to a file
    output_test = output_test + "\n" + "LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (
        final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th,
        final_test_hamming_loss_th, std_test_hamming_loss_th, min_test_hamming_loss_th, max_test_hamming_loss_th,
        final_test_prec_th, std_test_prec_th, min_test_prec_th, max_test_prec_th,
        final_test_rec_th, std_test_rec_th, min_test_rec_th, max_test_rec_th,
        final_test_fmeasure_th, std_test_fmeasure_th, min_test_fmeasure_th, max_test_fmeasure_th) + "\n"
    output_csv_test = output_csv_test + "\n" + "average" + "," + \
        str(round(final_test_hamming_loss_th, 3)) + "±" + str(round(std_test_hamming_loss_th, 3)) + "," + \
        str(round(final_test_acc_th, 3)) + "±" + str(round(std_test_acc_th, 3)) + "," + \
        str(round(final_test_prec_th, 3)) + "±" + str(round(std_test_prec_th, 3)) + "," + \
        str(round(final_test_rec_th, 3)) + "±" + str(round(std_test_rec_th, 3)) + "," + \
        str(round(final_test_fmeasure_th, 3)) + "±" + str(round(std_test_fmeasure_th, 3))

    setting = "dataset:" + str(FLAGS.dataset) + "\nT: " + str(FLAGS.num_topics) + \
        "\nk: " + str(FLAGS.k_num_doc) + ' \ni: ' + str(FLAGS.iterations)

    print("--- The whole program took %s seconds ---" % (time.time() - start_time))
    time_used = "--- The whole program took %s seconds ---" % (time.time() - start_time)
    if FLAGS.kfold != -1:
        print("--- The average training took %s ± %s seconds ---"
              % (sum(time_train) / num_runs, statistics.stdev(time_train)))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (
            sum(time_train) / num_runs, statistics.stdev(time_train))
    else:
        print("--- The average training took %s ± %s seconds ---"
              % (sum(time_train) / num_runs, 0))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (
            sum(time_train) / num_runs, 0)

    # output setting configuration, results, prediction and time used
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' + str(FLAGS.k_num_doc) +
        ' i' + str(FLAGS.iterations) + ' gp_id' + str(FLAGS.marking_id) + '.txt',
        setting + '\n' + output_valid + '\n' + output_test + '\n' + prediction_str + '\n' +
        time_used + '\n' + average_time_train)
    # output structured evaluation results
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' + str(FLAGS.k_num_doc) +
        ' i' + str(FLAGS.iterations) + ' gp_id' + str(FLAGS.marking_id) + ' valid.csv',
        output_csv_valid)
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' + str(FLAGS.k_num_doc) +
        ' i' + str(FLAGS.iterations) + ' gp_id' + str(FLAGS.marking_id) + ' test.csv',
        output_csv_test)
def load_pred_data(data_path, vocab_word2index, vocab_char2index, vocab_pos2index, vocab_cap2index, sentence_len, word_len, flag_use_char, flag_use_pos, flag_use_cap): """ :param data_path: :param vocab_label2index: :param vocab_word2index: :param vocab_char2index: :param vocab_pos2index: :param vocab_cap2index: :param sentence_len: max length of word sequence :param word_len: max length of char sequence :return: X: [ word_sequence, char_sequence, pos_sequence, cap_sequence] - word_sequence: sentence_len - char_sequence: sentence_len * word_len - pos_sequence: sentence_len - cap_sequence: sentence_len """ data_file = codecs.open(data_path, mode='r', encoding='utf-8') data_lines = data_file.readlines() # build data samples: Word_sequences = [] Char_sequences = [] Pos_sequences = [] Cap_sequences = [] for i, line in enumerate(data_lines): raw_list = line.strip().split("\t") input_list = raw_list[1].split(" ") # get word lists word_sequence = [vocab_word2index.get(x, UNK_ID) for x in input_list] Word_sequences.append(word_sequence) # get char lists if flag_use_char: char_sequence = [] # [sentence_len, word_len] for word in input_list: char_indexs = [vocab_char2index.get(char, UNK_ID) for char in word] char_sequence.append(char_indexs) if len(input_list) < sentence_len: char_sequence.extend( [[0]] * (sentence_len-len(input_list))) else: char_sequence = char_sequence[:sentence_len] char_sequence = pad_sequences(char_sequence, maxlen=word_len, value=0.) Char_sequences.append(char_sequence) if flag_use_pos: pos_sequence = nltk.pos_tag(input_list) # [sentence_len] word_seq, pos_seq = zip(*pos_sequence) pos_sequence = list(pos_seq) pos_sequence = [vocab_pos2index.get(pos, UNK_ID) for pos in pos_sequence] Pos_sequences.append(pos_sequence) if flag_use_cap: cap_sequence = [word_capitalize(word) for word in input_list] cap_sequence = [vocab_cap2index[cap] for cap in cap_sequence] Cap_sequences.append(cap_sequence) Word_sequences = pad_sequences(Word_sequences, maxlen=sentence_len, value=0.) if flag_use_pos: Pos_sequences = pad_sequences(Pos_sequences, maxlen=sentence_len, value=0.) if flag_use_cap: Cap_sequences = pad_sequences(Cap_sequences, maxlen=sentence_len, value=0.) X = {'word':np.array(Word_sequences), 'char':np.array(Char_sequences), 'pos':np.array(Pos_sequences), 'cap':np.array(Cap_sequences)} return X, data_lines
print("training data not exist==>load data, and dump it to file system") vocabulary_word2index, vocabulary_index2word = create_voabulary() vocab_size=len(vocabulary_word2index) vocabulary_word2index_label = create_voabulary_label() train, test, _ =load_data(vocabulary_word2index, vocabulary_word2index_label) trainX, trainY = train testX, testY = test print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 print("testY.shape:",np.array(testY).shape) #2500个label print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0 # 2.Data preprocessing # Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length # Converting labels to binary vectors trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot testY = to_categorical(testY, nb_classes=number_classes) #y as one hot print("end padding & transform to one hot...") #-------------------------------------------------------------------------------------------------- # cache trainX,trainY,testX,testY for next time use. # with open(f_cache, 'w') as f: # pickle.dump((trainX,trainY,testX,testY,vocab_size),f) #else: # print("traning data exists in cache. going to use it.") # 3.Building convolutional network ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC############################################################################################## #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData")
def main(_): trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary() vocab_size = len(vocabulary_word2index) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label() train,test = load_data_with_multilabels(vocabulary_word2index, vocabulary_word2index_label,FLAGS.training_path) #[1,11,3,1998,1998] trainX, trainY= train #TODO trainY1999 testX, testY = test #TODO testY1999 print("testX.shape:", np.array(testX).shape);print("testY.shape:", np.array(testY).shape) # 2500 labels # 2.Data preprocessing # Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length print("end padding & transform to one hot...") #2.create session. config=tf.ConfigProto() config.gpu_options.allow_growth=True with tf.Session(config=config) as sess: #Instantiate Model fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) #Initialize Save saver=tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): print("Restoring Variables from Checkpoint") saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text) curr_epoch=sess.run(fast_text.epoch_step) #3.feed data & training number_of_training_data=len(trainX) batch_size=FLAGS.batch_size for epoch in range(curr_epoch,FLAGS.num_epochs):#range(start,stop,step_size) loss, acc, counter = 0.0, 0.0, 0 for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)): if epoch==0 and counter==0: print("trainX[start:end]:",trainX[start:end]) #2d-array. each element's length is 100. print("trainY[start:end]:",trainY[start:end]) #a list; each element is itself a list that may carry 1, 2, 3, 4 or 5 labels. #print("trainY1999[start:end]:",trainY1999[start:end]) curr_loss,_=sess.run([fast_text.loss_val,fast_text.train_op],feed_dict={fast_text.sentence:trainX[start:end],fast_text.labels:trainY[start:end],}) #fast_text.labels_l1999:trainY1999[start:end] loss,counter=loss+curr_loss,counter+1 #acc+curr_acc, if counter %500==0: print("Epoch %d\tBatch %d\tTrain Loss:%.3f" %(epoch,counter,loss/float(counter))) #\tTrain Accuracy:%.3f--->,acc/float(counter) #epoch increment print("going to increment epoch counter....") sess.run(fast_text.epoch_increment) # 4.validation print("epoch:",epoch,"validate_every:",FLAGS.validate_every,"validate or not:",(epoch % FLAGS.validate_every==0)) if epoch % FLAGS.validate_every==0: eval_loss,eval_accuracy=do_eval(sess,fast_text,testX,testY,batch_size,vocabulary_index2word_label) #testY1999,eval_acc print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch,eval_loss,eval_accuracy)) #,\tValidation Accuracy: %.3f--->eval_acc #save model to checkpoint save_path=FLAGS.ckpt_dir+"model.ckpt" saver.save(sess,save_path,global_step=epoch) #fast_text.epoch_step # 5. finally, evaluate on the test set and report test accuracy test_loss, test_acc = do_eval(sess, fast_text, testX, testY,batch_size,vocabulary_index2word_label) #testY1999 pass
# -*- coding: utf-8 -*- # Apply an LSTM to IMDB sentiment dataset classification task import tflearn from tflearn.data_utils import to_categorical, pad_sequences from tflearn.datasets import imdb # IMDB Dataset loading (downloaded automatically if not already present) train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1) trainX, trainY = train #a list of 22500 elements; each element looks like [17,25,10,406,26,14,556,61,62,323,4], i.e. the indices of the words in the vocabulary testX, testY = test # Data preprocessing # Sequence padding trainX = pad_sequences(trainX, maxlen=100, value=0.)# zero-pad every sequence to length 100 testX = pad_sequences(testX, maxlen=100, value=0.) # Converting labels to binary vectors trainY = to_categorical(trainY, nb_classes=2)# one-hot encode the 0/1 labels as 2-dimensional vectors testY = to_categorical(testY, nb_classes=2) # Network building net = tflearn.input_data([None, 100]) net = tflearn.embedding(net, input_dim=10000, output_dim=128) net = tflearn.lstm(net, 128, dropout=0.8) net = tflearn.fully_connected(net, 2, activation='softmax') net = tflearn.regression(net, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy') # Training model = tflearn.DNN(net, tensorboard_verbose=0) model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=32) # batch_size=32 is an assumed value; the original snippet was cut off after show_metric=True
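# --- Added illustration (not part of the original snippet above): once model.fit(...) has
# --- finished, a TFLearn DNN can be scored and queried as sketched below. It assumes the
# --- names model/testX/testY defined in the snippet above and is only a hedged example.
score = model.evaluate(testX, testY, batch_size=128)          # returns a list holding the mean accuracy
print("Test accuracy: %.4f" % score[0])
probabilities = model.predict(testX[:5])                      # per-class probabilities, shape (5, 2)
predicted = [0 if p[0] > 0.5 else 1 for p in probabilities]   # same 0.5 threshold used by other snippets in this collection
print("Predicted labels:", predicted)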
vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn2") vocab_size = len(vocabulary_word2index) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") questionid_question_lists = load_final_test_data(FLAGS.predict_source_file) test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists) testX = [] question_id_list = [] for tuple in test: question_id, question_string_list = tuple question_id_list.append(question_id) testX.append(question_string_list) # 2.Data preprocessing: Sequence padding print("start padding....") testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length print("end padding...") # 3.create session. config = tf.ConfigProto() config.gpu_options.allow_growth = True graph=tf.Graph().as_default() global sess global textCNN with graph: sess=tf.Session(config=config) # 4.Instantiate Model textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training) saver = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
def main(_): # 1.load data with vocabulary of words and labels vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="dynamic_memory_network") vocab_size = len(vocabulary_word2index) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network") questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) testX=[] question_id_list=[] for tuple in test: question_id,question_string_list=tuple question_id_list.append(question_id) testX.append(question_string_list) # 2.Data preprocessing: Sequence padding print("start padding....") testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length testX2_cnn = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length, for CNN print("end padding...") # 3.create session. config=tf.ConfigProto() config.gpu_options.allow_growth=True graph1 = tf.Graph().as_default() graph2 = tf.Graph().as_default() graph3 = tf.Graph().as_default() graph4 = tf.Graph().as_default() graph5 = tf.Graph().as_default() global sess_dmn global sess_entity global sess_cnn global sess_rcnn with graph1:#DynamicMemoryNetwork sess_dmn = tf.Session(config=config) model_dmn = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,num_pass=FLAGS.num_pass, use_gated_gru=FLAGS.use_gated_gru,decode_with_sequences=FLAGS.decode_with_sequences,multi_label_flag=FLAGS.multi_label_flag,l2_lambda=FLAGS.l2_lambda) saver_dmn = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir_dmn + "checkpoint"): print("Restoring Variables from Checkpoint of DMN.") saver_dmn.restore(sess_dmn, tf.train.latest_checkpoint(FLAGS.ckpt_dir_dmn)) else: print("Can't find the checkpoint.going to stop.DMN") return with graph2:#EntityNet sess_entity = tf.Session(config=config) model_entity = EntityNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training, multi_label_flag=True, block_size=FLAGS.block_size,use_bi_lstm=FLAGS.use_bi_lstm) saver_entity = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir_entity + "checkpoint"): print("Restoring Variables from Checkpoint of EntityNet.") saver_entity.restore(sess_entity, tf.train.latest_checkpoint(FLAGS.ckpt_dir_entity)) else: print("Can't find the checkpoint.going to stop.EntityNet.") return with graph3:#TextCNN sess_cnn=tf.Session(config=config) model_cnn = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training) saver_cnn = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir_cnn + "checkpoint"): print("Restoring Variables from Checkpoint.TextCNN.") saver_cnn.restore(sess_cnn, tf.train.latest_checkpoint(FLAGS.ckpt_dir_cnn)) else: print("Can't find the checkpoint.going to stop.TextCNN.") return with graph5: #TextCNN_256embedding sess_cnn_256_embedding = tf.Session(config=config) model_cnn_256_embedding = TextCNN(filter_sizes_256_embedding, FLAGS.num_filters_256_embedding, FLAGS.num_classes, FLAGS.learning_rate, 
FLAGS.batch_size,FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, vocab_size, FLAGS.embed_size_256_embedding, FLAGS.is_training) saver_cnn_256_embedding = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir_cnn_256_embedding + "checkpoint"): print("Restoring Variables from Checkpoint.TextCNN_256_embedding") saver_cnn_256_embedding.restore(sess_cnn_256_embedding, tf.train.latest_checkpoint(FLAGS.ckpt_dir_cnn_256_embedding)) else: print("Can't find the checkpoint.going to stop.TextCNN_256_embedding.") return #with graph4:#RCNN # sess_rcnn=tf.Session(config=config) # model_rcnn=TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_len, # vocab_size,FLAGS.embed_size,FLAGS.is_training,FLAGS.batch_size,multi_label_flag=FLAGS.multi_label_flag) # saver_rcnn = tf.train.Saver() # if os.path.exists(FLAGS.ckpt_dir_rcnn + "checkpoint"): # print("Restoring Variables from Checkpoint.TextRCNN.") # saver_rcnn.restore(sess_rcnn, tf.train.latest_checkpoint(FLAGS.ckpt_dir_rcnn)) # else: # print("Can't find the checkpoint.going to stop.TextRCNN.") # return # 5.feed data, to get logits number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) index=0 predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') global sess_dmn global sess_entity for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): #1.DMN logits_dmn=sess_dmn.run(model_dmn.logits,feed_dict={model_dmn.query:testX2[start:end],model_dmn.story: np.expand_dims(testX2[start:end],axis=1), model_dmn.dropout_keep_prob:1.0}) #2.EntityNet logits_entity=sess_entity.run(model_entity.logits,feed_dict={model_entity.query:testX2[start:end],model_entity.story: np.expand_dims(testX2[start:end],axis=1), model_entity.dropout_keep_prob:1.0}) #3.CNN logits_cnn = sess_cnn.run(model_cnn.logits,feed_dict={model_cnn.input_x: testX2_cnn[start:end], model_cnn.dropout_keep_prob: 1}) #4.RCNN #logits_rcnn = sess_rcnn.run(model_rcnn.logits, feed_dict={model_rcnn.input_x: testX2_cnn[start:end],model_rcnn.dropout_keep_prob: 1}) # 'shape of logits:', ( 1, 1999) #5.CN_256_original_embeddding logits_cnn_256_embedding =sess_cnn_256_embedding.run(model_cnn_256_embedding.logits,feed_dict={model_cnn_256_embedding.input_x: testX2_cnn[start:end], model_cnn_256_embedding.dropout_keep_prob: 1}) #how to combine to logits: average logits=logits_cnn*0.3+logits_cnn_256_embedding*0.3+logits_entity*0.2+logits_dmn*0.2#+logits_rcnn*0.15 question_id_sublist=question_id_list[start:end] get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f) index=index+1 predict_target_file_f.close()
def main(_): #1.load data(X: list of int, y: int). #if os.path.exists(FLAGS.cache_path): # if a cache file exists on the file system, load the (vocabulary-indexed) data from it # with open(FLAGS.cache_path, 'r') as data_f: # trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f) # vocab_size=len(vocabulary_index2word) #else: if 1 == 1: trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary() vocab_size = len(vocabulary_word2index) vocabulary_word2index_label, _ = create_voabulary_label() train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label, data_type='train') trainX, trainY = train testX, testY = test print("testX.shape:", np.array(testX).shape) # 2500 lists; each list represents one sentence print("testY.shape:", np.array(testY).shape) # 2500 labels print("testX[0]:", testX[0]) # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] print("testX[1]:", testX[1]) print("testY[0]:", testY[0]) # 0 ;print("testY[1]:",testY[1]) #0 # 2.Data preprocessing # Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length ############################################################################################### #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly. # pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f) ############################################################################################### print("testX[0]:", testX[0]) print("testX[1]:", testX[1]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] # Converting labels to binary vectors print("testY[0]:", testY[0]) # 0 ;print("testY[1]:",testY[1]) #0 print("end padding & transform to one hot...") #2.create session. 
config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: #Instantiate Model fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training) #Initialize Save saver = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir + "checkpoint"): print("Restoring Variables from Checkpoint") saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text) curr_epoch = sess.run(fast_text.epoch_step) #3.feed data & training number_of_training_data = len(trainX) batch_size = FLAGS.batch_size for epoch in range(curr_epoch, FLAGS.num_epochs): #range(start,stop,step_size) loss, acc, counter = 0.0, 0.0, 0 for start, end in zip( range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)): if epoch == 0 and counter == 0: print("trainX[start:end]:", trainX[start:end]) print("trainY[start:end]:", trainY[start:end]) curr_loss, curr_acc, _ = sess.run( [ fast_text.loss_val, fast_text.accuracy, fast_text.train_op ], feed_dict={ fast_text.sentence: trainX[start:end], fast_text.labels: trainY[start:end] }) loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1 if counter % 500 == 0: print( "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, loss / float(counter), acc / float(counter))) #epoch increment print("going to increment epoch counter....") sess.run(fast_text.epoch_increment) # 4.validation print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0)) if epoch % FLAGS.validate_every == 0: eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY, batch_size) print( "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc)) #save model to checkpoint save_path = FLAGS.ckpt_dir + "model.ckpt" saver.save( sess, save_path, global_step=fast_text.epoch_step) #fast_text.epoch_step # 5.最后在测试集上做测试,并报告测试准确率 Test test_loss, test_acc = do_eval(sess, fast_text, testX, testY, batch_size) pass
min_frequency = 2 vp = tflearn.data_utils.VocabularyProcessor(max_tweet_length, min_frequency=min_frequency) vp = vp.fit(tweets) val = len(vp.vocabulary_) print(val) tweets_parsed = vp.transform(tweets) vp.save('my_dictionary') print(vp) trainX = tweets_parsed trainY = tflearn.data_utils.to_categorical(content1, nb_classes=2) # two classes, matching the 2-unit softmax layer built below filtered_gen = (item for item in trainX) gen_to_list = list(filtered_gen) trainX1 = pad_sequences(gen_to_list, maxlen=120, value=0.) #print(trainX1) # Network building net = tflearn.input_data([None, 120]) net = tflearn.embedding(net, input_dim=val, output_dim=64) net = tflearn.lstm(net, 64) net = tflearn.dropout(net, 0.5) net = tflearn.fully_connected(net, 2, activation='softmax') net = tflearn.regression(net, optimizer='adam',loss='binary_crossentropy') # Training model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
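# --- Added illustration (not from the original): a small, self-contained check of what
# --- tflearn.data_utils.to_categorical produces for binary labels, which is why nb_classes=2
# --- pairs with the 2-unit softmax layer above.
from tflearn.data_utils import to_categorical
labels = [0, 1, 1, 0]
one_hot = to_categorical(labels, nb_classes=2)
print(one_hot)   # -> [[1, 0], [0, 1], [0, 1], [1, 0]]: one row per label, one column per class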
print(train_y.values) train_y = np.reshape(train_y.values, (-1, 1)) test_y = np.reshape(test_y.values, (-1, 1)) #train_y = tf.constant(train_y.values) print(test_y) # Data preprocessing embeddings, vocab = get_embeddings(efile_name) print(embeddings.shape) vocab_size, embeddings_dim = embeddings.shape print(vocab_size, embeddings_dim) train_x, test_x = vectorize(train_x, test_x, embeddings, vocab, 25, unknown_token) embeddings = embeddings.as_matrix() train_x = pad_sequences(train_x, maxlen=25, value=0.) test_x = pad_sequences(test_x, maxlen=25, value=0.) batch_size = 8 # net building net = tflearn.input_data([None, 25]) net = tflearn.embedding(net, input_dim=len(vocab), output_dim=400, trainable=False, name="EmbeddingLayer") net = tflearn.layers.normalization.batch_normalization(net) #print('shape: ', net.shape) #net = tflearn.reshape(net, [None, 25, 400]) print('shape: ', net.shape)
def main(_): #1.load data(X:list of lint,y:int). #if os.path.exists(FLAGS.cache_path): # 如果文件系统中存在,那么加载故事(词汇表索引化的) # with open(FLAGS.cache_path, 'r') as data_f: # trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f) # vocab_size=len(vocabulary_index2word) #else: if 1 == 1: trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary( word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn2") #simple='simple' vocab_size = len(vocabulary_word2index) print("cnn_model.vocab_size:", vocab_size) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label( name_scope="cnn2") if FLAGS.multi_label_flag: FLAGS.traning_data_path = 'training-data/train-zhihu6-title-desc.txt' #test-zhihu5-only-title-multilabel.txt train, test, _ = load_data_multilabel_new( vocabulary_word2index, vocabulary_word2index_label, multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path ) #,traning_data_path=FLAGS.traning_data_path trainX, trainY = train testX, testY = test # 2.Data preprocessing.Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly. # pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f) print("trainX[0]:", trainX[0]) #;print("trainY[0]:", trainY[0]) # Converting labels to binary vectors print("end padding & transform to one hot...") #2.create session. config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: #Instantiate Model textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training, multi_label_flag=FLAGS.multi_label_flag) #Initialize Save saver = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir + "checkpoint"): print("Restoring Variables from Checkpoint") saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding assign_pretrained_word_embedding( sess, vocabulary_index2word, vocab_size, textCNN, word2vec_model_path=FLAGS.word2vec_model_path) curr_epoch = sess.run(textCNN.epoch_step) #3.feed data & training number_of_training_data = len(trainX) batch_size = FLAGS.batch_size for epoch in range(curr_epoch, FLAGS.num_epochs): loss, acc, counter = 0.0, 0.0, 0 for start, end in zip( range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)): if epoch == 0 and counter == 0: print("trainX[start:end]:", trainX[start:end] ) #;print("trainY[start:end]:",trainY[start:end]) feed_dict = { textCNN.input_x: trainX[start:end], textCNN.dropout_keep_prob: 0.5 } if not FLAGS.multi_label_flag: feed_dict[textCNN.input_y] = trainY[start:end] else: feed_dict[textCNN.input_y_multilabel] = trainY[start:end] curr_loss, curr_acc, _ = sess.run( [textCNN.loss_val, textCNN.accuracy, textCNN.train_op], feed_dict) #curr_acc--->TextCNN.accuracy loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc if counter % 50 == 0: print( "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, 
loss / float(counter), acc / float(counter)) ) #\tTrain Accuracy:%.3f ---> acc/float(counter) #epoch increment print("going to increment epoch counter....") sess.run(textCNN.epoch_increment) # 4.validation print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0)) if epoch % FLAGS.validate_every == 0: eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY, batch_size, vocabulary_index2word_label) print( "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc)) #save model to checkpoint save_path = FLAGS.ckpt_dir + "model.ckpt" saver.save(sess, save_path, global_step=epoch) # 5. finally, evaluate on the test set and report test accuracy test_loss, test_acc = do_eval(sess, textCNN, testX, testY, batch_size, vocabulary_index2word_label) pass
def test_pad(): trainX = 'w18476 w4454 w1674 w6 w25 w474 w1333 w1467 w863 w6 w4430 w11 w813 w4463 w863 w6 w4430 w111' trainX = [int(token.lstrip("w")) for token in trainX.split(" ")] # map the 'w123'-style tokens to integer ids so pad_sequences can build an integer array trainX = pad_sequences([trainX], maxlen=100, value=0.) # pad_sequences expects a list of sequences, so wrap the single sequence in one list print("trainX:", trainX)
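# --- Added illustration (not from the original): tflearn.data_utils.pad_sequences pads (and
# --- truncates) at the end of each sequence by default, filling with `value`, which is what
# --- every snippet in this collection relies on when padding to a fixed maxlen.
from tflearn.data_utils import pad_sequences
demo = pad_sequences([[17, 25, 10], [406, 26, 14, 56, 61]], maxlen=6, value=0.)
print(demo)
# [[ 17  25  10   0   0   0]
#  [406  26  14  56  61   0]]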
print("started...") # 1.IMDB Dataset loading train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1) trainX, trainY = train testX, testY = test print("testX.shape:", np.array(testX).shape) #2500个list.每个list代表一句话 print("testY.shape:", np.array(testY).shape) #2500个label print("testX[0]:", testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] print("testY[0]:", testY[0]) #0 # 2.Data preprocessing # Sequence padding trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length # Converting labels to binary vectors trainY = to_categorical(trainY, nb_classes=2) #y as one hot testY = to_categorical(testY, nb_classes=2) #y as one hot # 3.Building convolutional network #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") network = input_data( shape=[None, 100], name='input' ) #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training network = tflearn.embedding( network, input_dim=10000, output_dim=128 ) #[None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, oput_dim:embedding size #conv_1d(incoming,nb_filter,filter_size) branch1 = conv_1d(
n_datas = 10000 # IMDB Dataset loading train, test, _ = imdb.load_data(path='imdb.pkl', n_words=n_datas, valid_portion=0.1) trainX, trainY = train testX, testY = test # Data preprocessing # NOTE: Padding is required for dimension consistency. This will pad sequences # with 0 at the end, until it reaches the max sequence length. 0 is used as a # masking value by dynamic RNNs in TFLearn; a sequence length will be # retrieved by counting non zero elements in a sequence. Then dynamic RNN step # computation is performed according to that length. trainX = pad_sequences(trainX, maxlen=input_length_each_seq, value=0.) testX = pad_sequences(testX, maxlen=input_length_each_seq, value=0.) # Converting labels to binary vectors trainY = to_categorical(trainY, n_class) testY = to_categorical(testY, n_class) # Network building net = tflearn.input_data([None, input_length_each_seq]) # Masking is not required for embedding, sequence length is computed prior to # the embedding op and assigned as 'seq_length' attribute to the returned Tensor. net = tflearn.embedding(net, input_dim=n_datas, output_dim=hiddle_layes) net = tflearn.lstm(net, hiddle_layes_2, dropout=0.8, dynamic=True) net = tflearn.fully_connected(net, n_class, activation='softmax') net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split( X, Y, test_size=test_ratio, random_state=2017) Y_train = to_categorical(Y_train, nb_classes=len(qualities)) Y_test = to_categorical(Y_test, nb_classes=len(qualities)) ### Process vocabulary print('Process vocabulary') vocab_processor = tflearn.data_utils.VocabularyProcessor( max_document_length=model_size, min_frequency=0) X_train = np.array(list(vocab_processor.fit_transform(X_train))) X_test = np.array(list(vocab_processor.transform(X_test))) # transform (not fit_transform) so the test split reuses the vocabulary learned on the training split X_train = pad_sequences(X_train, maxlen=model_size, value=0.) X_test = pad_sequences(X_test, maxlen=model_size, value=0.) n_words = len(vocab_processor.vocabulary_) print('Total words: %d' % n_words) # pickle.dump (X_train, open ("xtrain.p", "wb")) # pickle.dump (X_test, open ("xtest.p", "wb")) # X_train = pickle.load (open ("xtrain.p", "rb")) # X_test = pickle.load (open ("xtest.p", "rb")) ### Models print('Build model')
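# --- Added illustration (not from the original): fit the VocabularyProcessor on the training
# --- text only, then reuse the same word->id mapping for the test split, mirroring the
# --- transform() call used above; words unseen during fitting map to id 0.
import numpy as np
from tflearn.data_utils import VocabularyProcessor
vp_demo = VocabularyProcessor(max_document_length=5, min_frequency=0)
train_ids = np.array(list(vp_demo.fit_transform(["the cat sat", "the dog ran"])))
test_ids = np.array(list(vp_demo.transform(["the cat ran"])))
print(train_ids)   # ids assigned from the training vocabulary, padded/truncated to length 5
print(test_ids)    # known words keep their training ids, unseen words become 0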
def main(_): #1.load data(X:list of lint,y:int). #if os.path.exists(FLAGS.cache_path): # load training data from cache file. # with open(FLAGS.cache_path, 'r') as data_f: # trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f) # vocab_size=len(vocabulary_index2word) #else: if 1==1: trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="dynamic_memory_network") #simple='simple' vocab_size = len(vocabulary_word2index) print("dynamic_memory_network.vocab_size:",vocab_size) vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network") if FLAGS.multi_label_flag: FLAGS.traning_data_path='../training-data/train-zhihu6-title-desc.txt' #change this line if want to train in a small dataset. e.g. dataset from 'test-zhihu6-title-desc.txt' train,test,_=load_data_multilabel_new(vocabulary_word2index,vocabulary_word2index_label,multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path) trainX, trainY = train testX, testY = test print("trainY:",trainY[0:10]) # 2.Data preprocessing.Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly. # pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f) print("trainX[0]:", trainX[0]) #;print("trainY[0]:", trainY[0]) # Converting labels to binary vectors print("end padding & transform to one hot...") #2.create session. 
config=tf.ConfigProto() config.gpu_options.allow_growth=True with tf.Session(config=config) as sess: #Instantiate Model model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,num_pass=FLAGS.num_pass, use_gated_gru=FLAGS.use_gated_gru,decode_with_sequences=FLAGS.decode_with_sequences,multi_label_flag=FLAGS.multi_label_flag,l2_lambda=FLAGS.l2_lambda) #Initialize Save saver=tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): print("Restoring Variables from Checkpoint") saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model,word2vec_model_path=FLAGS.word2vec_model_path) curr_epoch=sess.run(model.epoch_step) #3.feed data & training number_of_training_data=len(trainX) print("number_of_training_data:",number_of_training_data) previous_eval_loss=10000 best_eval_loss=10000 batch_size=FLAGS.batch_size for epoch in range(curr_epoch,FLAGS.num_epochs): loss, acc, counter = 0.0, 0.0, 0 for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)): if epoch==0 and counter==0: print("trainX[start:end]:",trainX[start:end])#;print("trainY[start:end]:",trainY[start:end]) feed_dict = {model.query: trainX[start:end],model.story: np.expand_dims(trainX[start:end],axis=1),model.dropout_keep_prob: 1.0} if not FLAGS.multi_label_flag: feed_dict[model.answer_single] = trainY[start:end] else: feed_dict[model.answer_multilabel]=trainY[start:end] curr_loss,curr_acc,_=sess.run([model.loss_val,model.accuracy,model.train_op],feed_dict) #curr_acc--->TextCNN.accuracy loss,counter,acc=loss+curr_loss,counter+1,acc+curr_acc if counter %50==0: print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %(epoch,counter,math.exp(loss/float(counter)) if (loss/float(counter))<20 else 10000.000,acc/float(counter))) #tTrain Accuracy:%.3f---》acc/float(counter) ##VALIDATION VALIDATION VALIDATION PART###################################################################################################### if FLAGS.batch_size!=0 and (start%(FLAGS.validate_step*FLAGS.batch_size)==0): #(epoch % FLAGS.validate_every) or if epoch % FLAGS.validate_every == 0: eval_loss, eval_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label) print("dynamic_memory_network[use_gated_gru=False,num_pass=2].validation.part. 
previous_eval_loss:", math.exp(previous_eval_loss) if previous_eval_loss<20 else 10000.000,";current_eval_loss:", math.exp(eval_loss) if eval_loss<20 else 10000.000) if eval_loss > previous_eval_loss: #if loss is not decreasing # reduce the learning rate by a factor of 0.5 print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>validation.part.going to reduce the learning rate.") learning_rate1 = sess.run(model.learning_rate) lrr=sess.run([model.learning_rate_decay_half_op]) learning_rate2 = sess.run(model.learning_rate) print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>validation.part.learning_rate1:", learning_rate1, " ;learning_rate2:",learning_rate2) else:# loss is decreasing if eval_loss<best_eval_loss: print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>going to save the model.eval_loss:",math.exp(eval_loss) if eval_loss<20 else 10000.000,";best_eval_loss:",math.exp(best_eval_loss) if best_eval_loss<20 else 10000.000) # save model to checkpoint save_path = FLAGS.ckpt_dir + "model.ckpt" saver.save(sess, save_path, global_step=epoch) best_eval_loss=eval_loss previous_eval_loss = eval_loss ##VALIDATION VALIDATION VALIDATION PART###################################################################################################### #epoch increment print("going to increment epoch counter....") sess.run(model.epoch_increment) # 5.test on test set test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label) pass
def main(_): #1.load data(X:list of lint,y:int). #if os.path.exists(FLAGS.cache_path): # 如果文件系统中存在,那么加载故事(词汇表索引化的) # with open(FLAGS.cache_path, 'r') as data_f: # trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f) # vocab_size=len(vocabulary_index2word) #else: if 1==1: trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="transformer_classification") #simple='simple' vocab_size = len(vocabulary_word2index) print("transformer.vocab_size:",vocab_size) vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(name_scope="transformer_classification") if FLAGS.multi_label_flag: FLAGS.traning_data_path='training-data/test-zhihu6-title-desc.txt' #one record like this:'w35620 w1097 w111 c278 c150 c150 c285 c278 c43 __label__7756633728210171144 3195914392210930723' train,test,_=load_data_multilabel_new(vocabulary_word2index,vocabulary_word2index_label,multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path) trainX, trainY, = train testX, testY = test print("trainY:",trainY[0:10]) # 2.Data preprocessing.Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly. # pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f) print("trainX[0]:", trainX[0]) #;print("trainY[0]:", trainY[0]) # Converting labels to binary vectors print("end padding & transform to one hot...") #2.create session. 
config=tf.ConfigProto() config.gpu_options.allow_growth=True with tf.Session(config=config) as sess: #Instantiate Model model=Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, vocab_size, FLAGS.embed_size,FLAGS.d_model,FLAGS.d_k,FLAGS.d_v,FLAGS.h,FLAGS.num_layer,FLAGS.is_training,l2_lambda=FLAGS.l2_lambda) #Initialize Save saver=tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): print("Restoring Variables from Checkpoint") saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model,word2vec_model_path=FLAGS.word2vec_model_path) curr_epoch=sess.run(model.epoch_step) #3.feed data & training number_of_training_data=len(trainX) print("number_of_training_data:",number_of_training_data) previous_eval_loss=10000 best_eval_loss=10000 batch_size=FLAGS.batch_size for epoch in range(curr_epoch,FLAGS.num_epochs): loss, acc, counter = 0.0, 0.0, 0 for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)): if epoch==0 and counter==0: print("trainX[start:end]:",trainX[start:end]) feed_dict = {model.input_x: trainX[start:end],model.dropout_keep_prob: 0.5} feed_dict[model.input_y_label]=trainY[start:end] curr_loss,curr_acc,_=sess.run([model.loss_val,model.accuracy,model.train_op],feed_dict) #curr_acc--->TextCNN.accuracy loss,counter,acc=loss+curr_loss,counter+1,acc+curr_acc if counter %50==0: print("transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %(epoch,counter,loss/float(counter),acc/float(counter))) #tTrain Accuracy:%.3f---》acc/float(counter) ##VALIDATION VALIDATION VALIDATION PART###################################################################################################### if FLAGS.batch_size!=0 and (start%(FLAGS.validate_step*FLAGS.batch_size)==0): eval_loss, eval_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label) print("transformer.classification.validation.part. 
previous_eval_loss:", previous_eval_loss,";current_eval_loss:",eval_loss) if eval_loss > previous_eval_loss: #if loss is not decreasing # reduce the learning rate by a factor of 0.5 print("transformer.classification.==>validation.part.going to reduce the learning rate.") learning_rate1 = sess.run(model.learning_rate) lrr=sess.run([model.learning_rate_decay_half_op]) learning_rate2 = sess.run(model.learning_rate) print("transformer.classification==>validation.part.learning_rate1:", learning_rate1, " ;learning_rate2:",learning_rate2) #print("HierAtten==>Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc)) else:# loss is decreasing if eval_loss<best_eval_loss: print("transformer.classification==>going to save the model.eval_loss:",eval_loss,";best_eval_loss:",best_eval_loss) # save model to checkpoint save_path = FLAGS.ckpt_dir + "model.ckpt" saver.save(sess, save_path, global_step=epoch) best_eval_loss=eval_loss previous_eval_loss = eval_loss ##VALIDATION VALIDATION VALIDATION PART###################################################################################################### #epoch increment print("going to increment epoch counter....") sess.run(model.epoch_increment) # 5.最后在测试集上做测试,并报告测试准确率 Test test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label) pass
from tflearn.layers.estimator import regression #Get data trainX, embeddings, trainY, maxLen, POS_labels = data.get_Data_Embeddings() POS_vectors, _ = labelMatrix2OneHot(POS_labels) del data print("TrainX : ", len(trainX)) print("TrainY : ", len(trainY)) print("Embd : ", len(embeddings)) print("POS : ", len(POS_labels)) print("Max Len : ", maxLen) # Data preprocessing # Sequence padding trainX = pad_sequences(trainX, maxlen=maxLen, value=0) #Converting labels to binary vectors trainY = pad_sequences(trainY, maxlen=maxLen, value=0) embeddings = concat_2Dvectors(embeddings, Flatten_3Dto2D(POS_vectors)) # Network building print("Beginning neural network") net = input_data(shape=[None, maxLen]) net = embedding(net, input_dim=len(embeddings), output_dim=len(embeddings[0]), trainable=False, name="EmbeddingLayer") print("After embeddings : ", net.get_shape().as_list()) net = bidirectional_rnn(net, BasicLSTMCell(1024),
def main(): train_x, train_y, val_x, val_y, test_x, test_y, vocab_size = load_data() label_size = 10 learning_rate = 0.01 batch_size = 128 decay_steps = 20000 decay_rate = 0.8 ckpt_dir = "fast_text_checkpoint/" sentence_len = 200 embed_size = 100 is_training = True num_epochs = 15 validate_every = 1 print("start padding...") train_x = pad_sequences(train_x, maxlen=sentence_len, value = 0) val_x = pad_sequences(val_x, maxlen=sentence_len, value = 0) test_x = pad_sequences(test_x, maxlen=sentence_len, value=0) print("end padding...") config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config = config) as sess: fast_text = fastText(label_size = 10, learning_rate = 0.01, batch_size = 128, decay_step = 20000, decay_rate = 0.8, sentence_len = 200, vocab_size = vocab_size, embed_size = 100, is_training = True) saver = tf.train.Saver() if os.path.exists(ckpt_dir + "checkpoint"): print("Restoring Variables from Checkpoint") saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) curr_epoch = sess.run(fast_text.epoch_step) number_of_training_data = len(train_x) batch_size = batch_size for epoch in range(curr_epoch, num_epochs): loss, acc, counter = 0.0, 0.0, 0 for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)): if epoch == 0 and counter == 0: print("trainX[start:end]:",train_x[start:end].shape) print("trainY[start:end]:",train_y[start:end].shape) curr_loss, curr_acc, _ = sess.run([fast_text.loss_val, fast_text.accuracy, fast_text.train_op], feed_dict= \ { fast_text.sentence : train_x[start : end], fast_text.labels : train_y[start : end]} ) loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1 if counter % 500 == 0: print(epoch) print(counter) print(loss) print(acc) print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, loss / float(counter), acc / float(counter))) print("going to increment epoch counter....") sess.run(fast_text.epoch_increment) print(epoch, validate_every, (epoch % validate_every == 0)) if epoch % validate_every == 0: eval_loss, eval_acc = do_eval(sess, fast_text, val_x, val_y, batch_size) print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc)) # save model to checkpoint save_path = ckpt_dir + "model.ckpt" saver.save(sess, save_path, global_step=fast_text.epoch_step) # fast_text.epoch_step test_loss, test_acc = do_eval(sess, fast_text, test_x, test_y, batch_size) print("test Loss:%.3f\ttest Accuracy: %.3f" % (test_loss, test_acc)) return
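# --- Added sketch (not from the original): do_eval is called above but never defined in this
# --- snippet. A minimal version consistent with how it is used here (the model exposes
# --- .sentence, .labels, .loss_val and .accuracy, exactly as in the training loop) could be:
def do_eval_sketch(sess, fast_text, eval_x, eval_y, batch_size):
    """Average loss/accuracy over eval_x/eval_y in mini-batches; assumption-based sketch."""
    number_examples = len(eval_x)
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    for start, end in zip(range(0, number_examples, batch_size),
                          range(batch_size, number_examples, batch_size)):
        curr_loss, curr_acc = sess.run(
            [fast_text.loss_val, fast_text.accuracy],
            feed_dict={fast_text.sentence: eval_x[start:end],
                       fast_text.labels: eval_y[start:end]})
        eval_loss, eval_acc, eval_counter = eval_loss + curr_loss, eval_acc + curr_acc, eval_counter + 1
    return eval_loss / float(max(eval_counter, 1)), eval_acc / float(max(eval_counter, 1))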
def load_data(traning_data_path, vocab_word2index, vocab_label2index, sentence_len, name_scope, training_portion=0.95, tokenize_style='char'): """ convert data as indexes using word2index dicts. :param traning_data_path: :param vocab_word2index: :param vocab_label2index: :return: """ cache_data_dir = 'cache' + "_" + name_scope # path to save cache cache_file = cache_data_dir + "/" + 'train_valid_test.pik' print("cache_path:", cache_file, "train_valid_test_file_exists:", os.path.exists(cache_file)) if os.path.exists(cache_file): with open(cache_file, 'rb') as data_f: print("going to load cache file from file system and return") return pickle.load(data_f) csvfile = open(traning_data_path, 'r') spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|') label_size = len(vocab_label2index) X1_ = [] X2_ = [] Y_ = [] tfidf_source_file = './data/atec_nl_sim_train.txt' tfidf_target_file = './data/atec_nl_sim_tfidf.txt' if not os.path.exists(tfidf_target_file): get_tfidf_score_and_save(tfidf_source_file, tfidf_target_file) BLUE_SCORES_ = [] word_vec_fasttext_dict = load_word_vec( 'data/fasttext_fin_model_50.vec') #word embedding from fasttxt word_vec_word2vec_dict = load_word_vec( 'data/word2vec.txt') #word embedding from word2vec #word2vec.word2vec('/Users/test/PycharmProjects/question_answering_similarity/data/atec_additional_cropus.txt', # '/Users/test/PycharmProjects/question_answering_similarity/data/word2vec_fin.bin', size=50, verbose=True,kind='txt') #print("word_vec_word2vec_dict:",word_vec_word2vec_dict) tfidf_dict = load_tfidf_dict('data/atec_nl_sim_tfidf.txt') for i, row in enumerate( spamreader ): ##row:['\ufeff1', '\ufeff怎么更改花呗手机号码', '我的花呗是以前的手机号码,怎么更改成现在的支付宝的号码手机号', '1'] x1_list = token_string_as_list(row[1], tokenize_style=tokenize_style) x1 = [vocab_word2index.get(x, UNK_ID) for x in x1_list] x2_list = token_string_as_list(row[2], tokenize_style=tokenize_style) x2 = [vocab_word2index.get(x, UNK_ID) for x in x2_list] #add blue score features 2018-05-06 features_vector = data_mining_features(i, row[1], row[2], vocab_word2index, word_vec_fasttext_dict, word_vec_word2vec_dict, tfidf_dict, n_gram=8) features_vector = [float(x) for x in features_vector] BLUE_SCORES_.append(features_vector) y_ = row[3] y = vocab_label2index[y_] X1_.append(x1) X2_.append(x2) Y_.append(y) if i == 0 or i == 1 or i == 2: print(i, "row[1]:", row[1], ";x1:") print(row[1].decode("utf-8")) print(i, "row[2]:", row[2], ";x2:") print(row[2].decode("utf-8")) print(i, "row[3]:", row[3], ";y:", str(y)) print(i, "row[4].feature vectors:", features_vector) number_examples = len(Y_) #shuffle X1 = [] X2 = [] Y = [] BLUE_SCORES = [] permutation = np.random.permutation(number_examples) for index in permutation: X1.append(X1_[index]) X2.append(X2_[index]) Y.append(Y_[index]) BLUE_SCORES.append(BLUE_SCORES_[index]) X1 = pad_sequences(X1, maxlen=sentence_len, value=0.) # padding to max length X2 = pad_sequences(X2, maxlen=sentence_len, value=0.) # padding to max length valid_number = min(1600, int((1 - training_portion) * number_examples)) test_number = 800 training_number = number_examples - valid_number - test_number valid_end = training_number + valid_number print(";training_number:", training_number, "valid_number:", valid_number, ";test_number:", test_number) #generate more training data, while still keep data distribution for valid and test. 
X1_final, X2_final, BLUE_SCORE_final, Y_final, training_number_big = get_training_data( X1[0:training_number], X2[0:training_number], BLUE_SCORES[0:training_number], Y[0:training_number], training_number) train = (X1_final, X2_final, BLUE_SCORE_final, Y_final) valid = (X1[training_number + 1:valid_end], X2[training_number + 1:valid_end], BLUE_SCORES[training_number + 1:valid_end], Y[training_number + 1:valid_end]) test = (X1[valid_end:], X2[valid_end:], BLUE_SCORES[valid_end:], Y[valid_end:]) # use the same start index for all four parallel lists so the test tuple stays aligned true_label_numbers = len([y for y in Y if y == 1]) true_label_pert = float(true_label_numbers) / float(number_examples) #save train/valid/test/true_label_pert to file system as cache # save to the file system if the cache does not already exist (pickle). if not os.path.exists(cache_file): with open(cache_file, 'ab') as data_f: print("going to dump train/valid/test data to file system.") pickle.dump((train, valid, test, true_label_pert), data_f) return train, valid, test, true_label_pert
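# --- Added illustration (not from the original): the slice boundaries above have to be
# --- identical for X1/X2/BLUE_SCORES/Y, otherwise the four parallel lists fall out of
# --- alignment. A compact way to keep them aligned is to slice every list with the same
# --- index ranges, as in this small self-contained helper:
def split_parallel_lists(arrays, train_n, valid_n):
    """Split several parallel sequences into (train, valid, test) tuples sharing the same boundaries."""
    valid_end = train_n + valid_n
    train = tuple(a[:train_n] for a in arrays)
    valid = tuple(a[train_n:valid_end] for a in arrays)
    test = tuple(a[valid_end:] for a in arrays)
    return train, valid, test
# toy data of length 10: 6 train, 2 valid, 2 test
toy = [list(range(10)), list(range(10, 20)), list(range(20, 30)), list(range(30, 40))]
train_demo, valid_demo, test_demo = split_parallel_lists(toy, 6, 2)
print(train_demo[0], valid_demo[0], test_demo[0])   # [0, 1, 2, 3, 4, 5] [6, 7] [8, 9]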
def main(_): # load data trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary( word2vec_vocabulary_path='../../utils/dump/vocabulary', name_scope="TextCNN") vocab_size = len(vocabulary_word2index) print("cnn_model.vocab_size:", vocab_size) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label( regression_flag=FLAGS.regression_flag, vocabulary_label='../../input/tourist.zh.train.txt', name_scope="TextCNN") # train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label, using_kfold=True, # training_data_path='../../input/tourist.zh.train.txt', # multi_label_flag=FLAGS.multi_label_flag) kf_id = -1 for train_X, train_y, valid_X, valid_y, ID in load_data_multilabel_new( vocabulary_word2index, vocabulary_word2index_label, using_kfold=True, training_data_path='../../input/tourist.zh.train.txt', multi_label_flag=FLAGS.multi_label_flag): kf_id += 1 if kf_id != 4: continue trainX, trainY = train_X, train_y testX, testY = valid_X, valid_y print('hello', ID) # 2. Data preprocessing.Sequence padding print("start padding & transform to one hot...") trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length print("trainX[0]:", trainX[0]) # Converting labels to binary vectors print("end padding & transform to one hot...") #2.create session. config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: #Instantiate Model textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training, regression_flag=FLAGS.regression_flag, multi_label_flag=FLAGS.multi_label_flag) train_write = tf.summary.FileWriter('log/train_{}'.format(kf_id), sess.graph) test_write = tf.summary.FileWriter('log/test_{}'.format(kf_id)) merged = tf.summary.merge_all() #Initialize Save saver = tf.train.Saver() if os.path.exists(FLAGS.ckpt_dir + str(kf_id) + "/" + "checkpoint_{}".format(kf_id)): print("Restoring Variables from Checkpoint") saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding assign_pretrained_word_embedding( sess, vocabulary_index2word, vocab_size, textCNN, word2vec_model_path=FLAGS.word2vec_model_path) curr_epoch = sess.run(textCNN.epoch_step) # 3.feed data & training number_of_training_data = len(trainX) batch_size = FLAGS.batch_size index = 0 for epoch in range(curr_epoch, FLAGS.num_epochs): loss, acc, counter = 0.0, 0.0, 0 # 批处理 for start, end in zip( range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)): if epoch == 0 and counter == 0: print( "trainX[start:end]:", trainX[start:end] ) #;print("trainY[start:end]:",trainY[start:end]) feed_dict = { textCNN.input_x: trainX[start:end], textCNN.dropout_keep_prob: 0.5 } if not FLAGS.multi_label_flag: feed_dict[textCNN.input_y] = trainY[start:end] else: feed_dict[ textCNN.input_y_multilabel] = trainY[start:end] summary, curr_loss, curr_acc, _ = sess.run([ merged, textCNN.loss_val, textCNN.accuracy, textCNN.train_op ], feed_dict) #curr_acc--->TextCNN.accuracy loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc if counter % 50 == 0: print( 
"Epoch %d\tBatch %d\tTrain Loss:%.5f\tTrain Accuracy:%.5f" % (epoch, counter, loss / float(counter), acc / float(counter)) ) #tTrain Accuracy:%.3f---》acc/float(counter) train_write.add_summary(summary, index) index += 1 #epoch increment print("going to increment epoch counter....") sess.run(textCNN.epoch_increment) # 4.validation print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0)) if epoch % FLAGS.validate_every == 0: eval_loss, eval_acc = do_eval(sess, merged, test_write, index, textCNN, testX, testY, batch_size, vocabulary_index2word_label) print( "Epoch %d Validation Loss:%.5f\tValidation Accuracy: %.5f" % (epoch, eval_loss, eval_acc)) #save model to checkpoint save_path = FLAGS.ckpt_dir + str( kf_id) + "/" + "model.ckpt" saver.save(sess, save_path, global_step=epoch) # 5. 最后在测试集上做测试,并报告测试准确率 Test # test_loss, test_acc = do_eval(sess, merged,test_write,epoch, textCNN, testX, testY, batch_size, vocabulary_index2word_label) # print("Validation Loss:%.5f\tValidation Accuracy: %.5f" % (test_loss, test_acc)) # 6. 自定义衡量指标 self_acc = _eval(sess, textCNN, testX, testY, vocabulary_index2word_label, ID=ID, kf_id=kf_id, regression_flag=FLAGS.regression_flag, namse_scope='demon') train_write.close() pass
def main(_):
    X_train, X_val, y_train, y_val, n_classes = train_test_loader()
    with open('data/vocab.dic', 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab) + 1
    print('size of vocabulary: {}'.format(vocab_size))

    # padding sentences
    X_train = pad_sequences(X_train, maxlen=FLAGS.sentence_len, value=float(vocab_size - 1))
    X_val = pad_sequences(X_val, maxlen=FLAGS.sentence_len, value=float(vocab_size - 1))

    # convert label to one-hot encode
    # to_categorical(y_train, n_classes)
    # to_categorical(y_val, n_classes)

    # create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textrcnn = TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                            FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len,
                            vocab_size, FLAGS.embed_size, FLAGS.is_training)
        # Initialize saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
            print('restoring variables from checkpoint')
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:
                assign_pretrained_word_embedding(sess, vocab, vocab_size, textrcnn)
        curr_epoch = sess.run(textrcnn.epoch_step)

        # feed data and training
        number_of_training_data = len(X_train)
        batch_size = FLAGS.batch_size
        best_val_acc = 0.0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = .0, .0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 or counter == 0:
                    pass  # print('X_train[start:end]: {}'.format(X_train[start:end]))
                feed_dict = {textrcnn.input_x: X_train[start:end],
                             textrcnn.dropout_keep_prob: 0.5}
                if not FLAGS.multi_label_flag:
                    feed_dict[textrcnn.input_y] = y_train[start:end]
                else:
                    feed_dict[textrcnn.input_y_multilabel] = y_train[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [textrcnn.loss_val, textrcnn.accuracy, textrcnn.train_op], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print('Epoch {}\tBatch {}\tTrain Loss {}\tTrain Accuracy {}'.format(
                        epoch, counter, loss / float(counter), acc / float(counter)))
            print('going to increment epoch counter ...')
            sess.run(textrcnn.epoch_increment)

            # validation
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textrcnn, X_val, y_val, batch_size)
                print("Epoch {} Validation Loss: {}\tValidation Accuracy: {}".format(
                    epoch, eval_loss, eval_acc))
                if eval_acc > best_val_acc:
                    best_val_acc = eval_acc
                    # save model to checkpoint
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    saver.save(sess, save_path, global_step=epoch)
                else:
                    # early stopping: roll back to the best checkpoint and stop training
                    saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
                    break

        # report result
        test_loss, test_acc = do_eval(sess, textrcnn, X_val, y_val, batch_size)
name_scope="cnn2") vocab_size = len(vocabulary_word2index) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label( name_scope="cnn2") questionid_question_lists = load_final_test_data(FLAGS.predict_source_file) test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists) testX = [] question_id_list = [] for tuple in test: question_id, question_string_list = tuple question_id_list.append(question_id) testX.append(question_string_list) # 2.Data preprocessing: Sequence padding print("start padding....") testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length print("end padding...") # 3.create session. config = tf.ConfigProto() config.gpu_options.allow_growth = True graph = tf.Graph().as_default() global sess global textCNN with graph: sess = tf.Session(config=config) # 4.Instantiate Model textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training) saver = tf.train.Saver()
""" from __future__ import division, print_function, absolute_import import tensorflow as tf import tflearn from tflearn.layers.core import input_data, dropout, fully_connected from tflearn.layers.conv import conv_1d, global_max_pool, max_pool_1d from tflearn.layers.merge_ops import merge from tflearn.layers.estimator import regression from tflearn.data_utils import to_categorical, pad_sequences from data.data_glass import * trainX, trainY, testX, testY = getGlassData() # Data preprocessing # Sequence padding trainX = pad_sequences(trainX, maxlen=10, value=0.) testX = pad_sequences(testX, maxlen=10, value=0.) # # # Converting labels to binary vectors trainY = to_categorical(trainY, 6) testY = to_categorical(testY, 6) network = input_data(shape=[None, 10], name='input') network = tflearn.embedding(network, input_dim=1000, output_dim=128) branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") branch1 = max_pool_1d(branch1, 2) branch2 = conv_1d(network,
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from tflearn.data_utils import to_categorical, pad_sequences

data = keras.datasets.imdb

# IMDB: load the dataset, keeping only the 10,000 most frequent words
train, test = data.load_data(num_words=10000)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding: each review is converted to a fixed-length vector, padded with 0.
# A fixed length is required for input consistency/dimensionality; maxlen can be raised
# (e.g. to 200 or 256) to keep more of each review.
trainX = pad_sequences(trainX, maxlen=100)
testX = pad_sequences(testX, maxlen=100)

# Converting labels to binary vectors (2 classes: positive or negative)
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# create neural network
model = Sequential()
# Embedding maps each word index to a dense 16-dimensional vector,
# e.g. index 0 -> [0.2, 0.3, 0.4, ...]
model.add(Embedding(10000, 16))
# GlobalAveragePooling1D averages over the sequence dimension, reducing each review
# to a single 16-dimensional vector
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
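# The snippet above stops after defining the layers. A minimal sketch of compiling and
# training the model follows; the optimizer, epoch count, and batch size are illustrative
# choices, not taken from the original source.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # labels were one-hot encoded with to_categorical
              metrics=['accuracy'])
model.summary()
model.fit(trainX, trainY,
          epochs=5,
          batch_size=32,
          validation_data=(testX, testY))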
def main(_):
    # 1. load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if the cache file exists, load the (vocabulary-indexed) data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_vocabulary(
            word2vec_model_path=FLAGS.word2vec_model_path,
            name_scope="transformer_classification")
        vocab_size = len(vocabulary_word2index)
        print("transformer.vocab_size:", vocab_size)
        train, test, _ = load_data_multilabel_new(
            vocabulary_word2index, training_data_path=FLAGS.training_data_path)
        compare_train_data = WikiQA(word2vec=Word2Vec(), max_len=FLAGS.max_len_compare)
        compare_train_data.open_file(mode="train")
        compare_test_data = WikiQA(word2vec=Word2Vec(), max_len=FLAGS.max_len_compare)
        compare_test_data.open_file(mode="valid")
        trainX, trainY = train
        testX, testY = test
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                            FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k,
                            FLAGS.d_v, FLAGS.h, FLAGS.num_layer, FLAGS.is_training,
                            compare_train_data.num_features, di=50,
                            s=compare_train_data.max_len, w=4, l2_reg=0.0004,
                            l2_lambda=FLAGS.l2_lambda)
        print("=" * 50)
        print("List of Variables:")
        for v in tf.trainable_variables():
            print(v.name)
        print("=" * 50)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(
                    sess, vocabulary_index2word, vocab_size, model,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)

        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)
        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            compare_train_data.reset_index()
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                batch_x1, batch_x2, _, batch_features = compare_train_data.next_batch(
                    batch_size=end - start)
                feed_dict = {model.input_x: trainX[start:end],
                             model.dropout_keep_prob: 0.9,
                             model.x1: batch_x1,
                             model.x2: batch_x2,
                             model.features: batch_features}
                feed_dict[model.input_y_label] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [model.loss_val, model.accuracy, model.train_op], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print("transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))

                ## VALIDATION PART ##
                if FLAGS.batch_size != 0 and (start % (FLAGS.validate_step * FLAGS.batch_size) == 0):
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY,
                                                  compare_test_data, batch_size)
                    print("transformer.classification.validation.part. previous_eval_loss:",
                          previous_eval_loss, ";current_eval_loss:", eval_loss)
                    if eval_loss > previous_eval_loss:
                        # validation loss is not decreasing: halve the learning rate
                        print("transformer.classification==>validation.part.going to reduce the learning rate.")
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print("transformer.classification==>validation.part.learning_rate1:",
                              learning_rate1, " ;learning_rate2:", learning_rate2)
                    else:
                        # validation loss is decreasing
                        if eval_loss < best_eval_loss:
                            print("transformer.classification==>going to save the model.eval_loss:",
                                  eval_loss, ";best_eval_loss:", best_eval_loss)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                    compare_test_data.reset_index()
                ## END OF VALIDATION PART ##

            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
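# The loop above relies on model.learning_rate_decay_half_op to halve the learning rate
# whenever validation loss stops improving. A minimal, self-contained sketch of how such
# an op is typically built in TF1-style code follows; the variable names and the initial
# value are illustrative, not the actual Transformer implementation.
import tensorflow as tf

initial_learning_rate = 0.001
learning_rate = tf.Variable(initial_learning_rate, trainable=False,
                            dtype=tf.float32, name="learning_rate")
# tf.assign returns an op; running it halves the variable in place, and any optimizer
# built against this variable will pick up the new value on the next step.
learning_rate_decay_half_op = tf.assign(learning_rate, learning_rate * 0.5)

with tf.Session() as demo_sess:
    demo_sess.run(tf.global_variables_initializer())
    print(demo_sess.run(learning_rate))            # 0.001
    demo_sess.run(learning_rate_decay_half_op)
    print(demo_sess.run(learning_rate))            # 0.0005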
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X, Y, test_size=0.2, random_state=2017)
Y_train = to_categorical(Y_train, nb_classes=len(qualities))
Y_test = to_categorical(Y_test, nb_classes=len(qualities))

### Process vocabulary
print('Process vocabulary')
vocab_processor = tflearn.data_utils.VocabularyProcessor(
    max_document_length=model_size, min_frequency=0)
X_train = np.array(list(vocab_processor.fit_transform(X_train)))
X_test = np.array(list(vocab_processor.transform(X_test)))  # reuse the vocabulary fitted on the training set
X_train = pad_sequences(X_train, maxlen=model_size, value=0.)
X_test = pad_sequences(X_test, maxlen=model_size, value=0.)
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

# pickle.dump(X_train, open("xtrain.p", "wb"))
# pickle.dump(X_test, open("xtest.p", "wb"))
# X_train = pickle.load(open("xtrain.p", "rb"))
# X_test = pickle.load(open("xtest.p", "rb"))

### Models
print('Build model')
def load_data(data_path, vocab_role_1st_label2index, vocab_role_2nd_label2index,
              vocab_func_label2index, vocab_word2index, vocab_char2index,
              vocab_pos2index, vocab_cap2index, sentence_len, word_len,
              flag_use_char, flag_use_pos, flag_use_cap):
    """
    :param data_path:
    :param vocab_role_1st_label2index:
    :param vocab_role_2nd_label2index:
    :param vocab_func_label2index:
    :param vocab_word2index:
    :param vocab_char2index:
    :param vocab_pos2index:
    :param vocab_cap2index:
    :param sentence_len:
    :param word_len:
    :param flag_use_char:
    :param flag_use_pos:
    :param flag_use_cap:
    :return: X: [word_sequence, char_sequence, pos_sequence, cap_sequence]
             - word_sequence: sentence_len
             - char_sequence: sentence_len * word_len
             - pos_sequence: sentence_len
             - cap_sequence: sentence_len
    """
    data_file = codecs.open(data_path, mode='r', encoding='utf-8')
    data_lines = data_file.readlines()
    random.shuffle(data_lines)

    # build data samples:
    link_Y = []
    Word_sequences = []
    Char_sequences = []
    Pos_sequences = []
    Cap_sequences = []
    role_1st_labels = []
    role_2nd_labels = []
    func_labels = []
    for i, line in enumerate(data_lines):
        link_index, raw_list = int(i), line
        raw_list = raw_list.strip().split("__label__")
        input_list = raw_list[0].strip().split(" ")
        label_list = raw_list[1].split('|')
        # get labels
        link_Y.append(link_index)
        role_1st_label = vocab_role_1st_label2index[label_list[0]]
        role_2nd_label = vocab_role_2nd_label2index[label_list[1]]
        func_label = vocab_func_label2index[label_list[2]]
        role_1st_labels.append(role_1st_label)
        role_2nd_labels.append(role_2nd_label)
        func_labels.append(func_label)
        # get word lists
        word_sequence = [vocab_word2index.get(x, UNK_ID) for x in input_list]
        Word_sequences.append(word_sequence)
        # get char lists
        if flag_use_char:
            char_sequence = []  # [sentence_len, word_len]
            for word in input_list:
                char_indexs = [vocab_char2index.get(char, UNK_ID) for char in word]
                char_sequence.append(char_indexs)
            if len(input_list) < sentence_len:
                char_sequence.extend([[0]] * (sentence_len - len(input_list)))
            else:
                char_sequence = char_sequence[:sentence_len]
            char_sequence = pad_sequences(char_sequence, maxlen=word_len, value=0.)
            Char_sequences.append(char_sequence)
        # get POS-tag lists
        if flag_use_pos:
            pos_sequence = nltk.pos_tag(input_list)  # [sentence_len]
            word_seq, pos_seq = zip(*pos_sequence)
            pos_sequence = [vocab_pos2index.get(pos, UNK_ID) for pos in list(pos_seq)]
            Pos_sequences.append(pos_sequence)
        # get capitalization lists
        if flag_use_cap:
            cap_sequence = [word_capitalize(word) for word in input_list]
            cap_sequence = [vocab_cap2index[cap] for cap in cap_sequence]
            Cap_sequences.append(cap_sequence)

    Word_sequences = pad_sequences(Word_sequences, maxlen=sentence_len, value=0.)
    if flag_use_pos:
        Pos_sequences = pad_sequences(Pos_sequences, maxlen=sentence_len, value=0.)
    if flag_use_cap:
        Cap_sequences = pad_sequences(Cap_sequences, maxlen=sentence_len, value=0.)

    X = {'word': np.array(Word_sequences),
         'char': np.array(Char_sequences),
         'pos': np.array(Pos_sequences),
         'cap': np.array(Cap_sequences),
         'role_1st': role_1st_labels,
         'role_2nd': role_2nd_labels,
         'func': func_labels}
    return (X, np.array(link_Y))
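# Hypothetical usage of load_data, kept commented out because the data path, vocabulary
# dictionaries, and lengths below are placeholders assumed to be built elsewhere in the
# project (e.g. by a create_vocabulary-style helper); it only illustrates the call
# signature and the shapes of the returned feature arrays.
#
# X, link_Y = load_data(
#     data_path='data/train.txt',
#     vocab_role_1st_label2index=vocab_role_1st_label2index,
#     vocab_role_2nd_label2index=vocab_role_2nd_label2index,
#     vocab_func_label2index=vocab_func_label2index,
#     vocab_word2index=vocab_word2index,
#     vocab_char2index=vocab_char2index,
#     vocab_pos2index=vocab_pos2index,
#     vocab_cap2index=vocab_cap2index,
#     sentence_len=100, word_len=16,
#     flag_use_char=True, flag_use_pos=True, flag_use_cap=True)
# X['word'].shape   # (n_samples, sentence_len)
# X['char'].shape   # (n_samples, sentence_len, word_len)
# X['pos'].shape    # (n_samples, sentence_len)
# X['cap'].shape    # (n_samples, sentence_len)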
def main(_):
    # 1. load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if the cache file exists, load the (vocabulary-indexed) data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(
            word2vec_model_path=FLAGS.word2vec_model_path, name_scope="rcnn")  # simple='simple'
        vocab_size = len(vocabulary_word2index)
        print("cnn_model.vocab_size:", vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rcnn")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path = 'training-data/train-zhihu6-title-desc.txt'  # test-zhihu5-only-title-multilabel.txt
        train, test, _ = load_data_multilabel_new(
            vocabulary_word2index, vocabulary_word2index_label,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=FLAGS.traning_data_path)
        trainX, trainY = train
        testX, testY = test

        # 2. Data preprocessing: sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)    # padding to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file so it can be reloaded quickly next time
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("trainX[0]:", trainX[0])
        # Converting labels to binary vectors
        print("end padding & transform to one hot...")

    # 2. create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textRCNN = TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.decay_steps,
                            FLAGS.decay_rate, FLAGS.sequence_length, vocab_size,
                            FLAGS.embed_size, FLAGS.is_training, FLAGS.batch_size,
                            multi_label_flag=FLAGS.multi_label_flag)
        # Initialize saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size,
                                                 textRCNN, word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textRCNN.epoch_step)

        # 3. feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {textRCNN.input_x: trainX[start:end],
                             textRCNN.dropout_keep_prob: 0.5}
                if not FLAGS.multi_label_flag:
                    feed_dict[textRCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textRCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [textRCNN.loss_val, textRCNN.accuracy, textRCNN.train_op],
                    feed_dict)  # curr_acc ---> TextRCNN.accuracy
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))

            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textRCNN.epoch_increment)

            # 4. validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textRCNN, testX, testY, batch_size,
                                              vocabulary_index2word_label)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, textRCNN, testX, testY, batch_size,
                                      vocabulary_index2word_label)
    pass