def get_data(train_size, random_seed=100):
    # Randomly shuffle, then split into train/test.
    ip_text = read_data('/tmp/data/text.txt')
    op_text = read_data('/tmp/data/summary.txt')
    logger.info('Length of text: {}'.format(len(ip_text)))
    # Wrap targets with start/end markers; note the space before 'eos'
    # (the original concatenated 'eos' directly onto the last word).
    op_text = [
        'sos ' + sent[:-1] + ' eos .' if sent.endswith('.')
        else 'sos ' + sent + ' eos .'
        for sent in op_text
    ]
    np.random.seed(random_seed)
    inds = np.arange(len(ip_text))
    np.random.shuffle(inds)
    train_inds = inds[:train_size]
    test_inds = inds[train_size:]
    tr_ip_text = [ip_text[ti] for ti in train_inds]
    tr_op_text = [op_text[ti] for ti in train_inds]
    ts_ip_text = [ip_text[ti] for ti in test_inds]
    ts_op_text = [op_text[ti] for ti in test_inds]
    return tr_ip_text, tr_op_text, ts_ip_text, ts_op_text
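# A minimal usage sketch for get_data (assumes read_data, logger and np are
# provided by the surrounding module and that the two files line up 1:1;
# the train_size value here is illustrative):
tr_ip, tr_op, ts_ip, ts_op = get_data(train_size=8000)
print(len(tr_ip), len(ts_ip))  # 8000 training pairs, the rest held out
print(tr_op[0][:30])           # targets are wrapped as 'sos ... eos .'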
def deal_with_data(self):
    '''
    Preprocess the data; optional if no preprocessing is needed.
    :return:
    '''
    # Load the data
    self.data = pd.read_csv(
        os.path.join(DATA_PATH, 'MedicalClass/train.csv'))
    # Split into training and validation sets
    self.train_data, self.valid_data = train_test_split(self.data,
                                                        test_size=0.01,
                                                        random_state=6,
                                                        shuffle=True)
    self.text2id, _ = load_dict(
        os.path.join(DATA_PATH, 'MedicalClass/words_fr.dict'))
    # self.text2id, _ = load_dict(self.train_data)
    self.label2id, _ = load_labeldict(
        os.path.join(DATA_PATH, 'MedicalClass/label.dict'))
    self.train_text, self.train_label = read_data(self.train_data,
                                                  self.text2id,
                                                  self.label2id)
    self.val_text, self.val_label = read_data(self.valid_data,
                                              self.text2id,
                                              self.label2id)
    print('=*= data preprocessing finished =*=')
def not_first_predict():
    ############# reading data #################################################
    logger.info("Starting to read testing samples...")
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(
        args.test_data_file)
    test_orig = pd.DataFrame({
        "question1": test_texts_1,
        "question2": test_texts_2
    })
    logger.info("Finished reading testing samples!")

    ###################### load a CSV file into DMatrix ######################
    # x_test = pd.read_csv(args.x_test_file, header=None, encoding="utf-8", sep="\t")
    # y_test = pd.read_csv(args.y_test_file, header=None, encoding="utf-8", sep="\t")
    # d_test = xgb.DMatrix(x_test, y_test)

    ###################### load an XGBoost binary file into DMatrix ##########
    d_test = xgb.DMatrix(args.dtest_file)

    ###################### predict ###########################################
    bst = xgb.Booster()              # init model
    bst.load_model(args.model_path)  # load model
    p_test = bst.predict(d_test, ntree_limit=args.ntree_limit)
    df_sub = pd.DataFrame({
        'user_query': test_texts_1,
        'candidate_query': test_texts_2,
        'label': test_labels,
        'score': p_test.ravel()
    })
    df_sub.to_csv(args.pred_data_file + str(args.ntree_limit),
                  header=False, index=False, encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])
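# Hedged sketch of how a binary DMatrix like args.dtest_file can be produced
# up front (save_binary and DMatrix(path) are standard XGBoost APIs; the file
# name and toy data below are illustrative):
import numpy as np
import xgboost as xgb

X = np.random.rand(100, 10)
y = np.random.randint(2, size=100)
d = xgb.DMatrix(X, label=y)
d.save_binary('dtest.buffer')         # write the binary once...
d_test = xgb.DMatrix('dtest.buffer')  # ...later loads beat re-parsing CSV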
import torch.optim as optim
import data_helper as dh
from data_helper import get_variable
from model import *
from masked_cross_entropy import *
import show as sh
import train_helper as th

if __name__ == '__main__':
    data_dir = './data'
    en_file = "{}/{}".format(data_dir, "seg_en")
    zh_file = "{}/{}".format(data_dir, "seg_zh")
    TARGET_MAX_LEN = 25
    USE_CUDA = False
    pairs, input_lang, target_lang = dh.read_data(en_file, zh_file, 20000)

    # Model configuration
    encoder_bidir = False
    score_method = 'general'
    hidden_size = 500
    n_layers = 2
    dropout_p = 0.1
    batch_size = 50

    # Training and optimization configuration
    clip = 50.0
    teacher_forcing_ratio = 0.5
    learning_rate = 0.0001
    decoder_learning_ratio = 5.0
    n_epochs = 20000
use_teacher_forcing = random.random() < 1  # always True; see comment below
for t in range(max_target_len):
    # output: (batch, output_size)
    output, decoder_hidden, attn_weights = decoder(decoder_input,
                                                   decoder_hidden,
                                                   encoder_outputs)
    all_decoder_outputs[t] = output
    # Feeds the ground-truth label at every step; with a ratio < 1 the
    # decoder would sometimes be fed its own previous output instead.
    if use_teacher_forcing:
        decoder_input = target_batches[t]
    else:
        # Pick the top-scoring word from the output for each batch element
        words = []
        for b in range(batch_size):
            topv, topi = output[b].data.topk(1)
            words.append(int(topi[0]))  # extract the index as a plain int
        decoder_input = get_variable(torch.LongTensor(words))

loss = masked_cross_entropy(
    all_decoder_outputs.transpose(0, 1).contiguous(),
    target_batches.transpose(0, 1).contiguous(),
    target_lengths)
print(loss)


if __name__ == '__main__':
    data_dir = './data'
    en_file = "{}/{}".format(data_dir, "seg_en_30000.txt")
    zh_file = "{}/{}".format(data_dir, "seg_zh_30000.txt")
    pairs, input_lang, target_lang = helper.read_data(en_file, zh_file, 100)
    test_model(pairs, input_lang, target_lang)
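# A minimal sketch of what masked_cross_entropy is assumed to compute above:
# per-token cross entropy, masked so padding positions past each target
# length contribute nothing (function name and signature are assumptions):
import torch
import torch.nn.functional as F

def masked_cross_entropy_sketch(logits, target, lengths):
    # logits: (batch, max_len, vocab); target: (batch, max_len); lengths: (batch,)
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(2, target.unsqueeze(2)).squeeze(2)  # (batch, max_len)
    mask = torch.arange(target.size(1)).unsqueeze(0) < lengths.unsqueeze(1)
    return (nll * mask.float()).sum() / lengths.float().sum()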
def train(self):
    # Prepare data
    sentence_train, slot_train, sentence_dev, slot_dev, vocab_sentence,\
        vocab_slot = data_helper.prepare_data(
            "data",
            sentence_training_file,
            slot_training_file,
            sentence_developing_file,
            slot_developing_file,
            from_vocabulary_size=2000,
            to_vocabulary_size=2000,
            tokenizer=None)

    sentence_developing, slot_developing = data_helper.read_data(
        sentence_dev, slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        sentence_train, slot_train, max_size=None)

    ## TODO:
    #sentence_training, slot_training = sentence_training[:1000],\
    #    slot_training[:1000]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

    # For conlleval script
    words_train = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_training
    ]
    labels_train = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_training
    ]
    words_val = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_developing
    ]
    labels_val = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_developing
    ]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)
    #model = Sequential()
    #model.add(Embedding(n_vocab, 100))
    #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    #model.add(Dropout(0.25))
    #model.add(GRU(100, return_sequences=True))
    #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    #model.compile('rmsprop', 'categorical_crossentropy')

    ## Training
    ##n_epochs = 30
    #n_epochs = 1
    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    #print("Training =>")
    #train_pred_label = []
    #avgLoss = 0
    #for i in range(n_epochs):
    #    print("Training epoch {}".format(i))
    #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
    #    for n_batch, sent in bar(enumerate(sentence_training)):
    #        label = slot_training[n_batch]
    #        # Make labels one hot
    #        label = np.eye(n_classes)[label][np.newaxis, :]
    #        # View each sentence as a batch
    #        sent = sent[np.newaxis, :]
    #        if sent.shape[1] > 1:  # ignore 1-word sentences
    #            loss = model.train_on_batch(sent, label)
    #            avgLoss += loss
    #        pred = model.predict_on_batch(sent)
    #        pred = np.argmax(pred, -1)[0]
    #        train_pred_label.append(pred)
    #    avgLoss = avgLoss/n_batch
    #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
    #                      for y in train_pred_label]
    #    con_dict = conlleval(predword_train, labels_train,
    #                         words_train, 'measure.txt')
    #    train_f_scores.append(con_dict['f1'])
    #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
    #        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
    #    # Save model
    #    model.save(filepath_model)
    #    gc.collect()

    print("Validating =>")
    from keras.models import load_model
    model = load_model(filepath_model)

    labels_pred_val = []
    avgLoss = 0
    bar = progressbar.ProgressBar(max_value=len(sentence_developing))
    for n_batch, sent in bar(enumerate(sentence_developing)):
        label = slot_developing[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]
        if sent.shape[1] > 1:  # some bug in keras
            loss = model.test_on_batch(sent, label)
            avgLoss += loss
        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        labels_pred_val.append(pred)
    avgLoss = avgLoss / n_batch
    gc.collect()

    predword_val = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in labels_pred_val
    ]
    con_dict = conlleval(predword_val, labels_val, words_val, 'measure.txt')
    val_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

    if con_dict['f1'] > best_val_f1:
        best_val_f1 = con_dict['f1']
        with open('model_architecture.json', 'w') as outf:
            outf.write(model.to_json())
        model.save_weights('best_model_weights.h5', overwrite=True)
        print("Best validation F1 score = {}".format(best_val_f1))
    print()
def train():
    # TODO: Saving DMatrix into an XGBoost binary file will make loading faster
    ################################################################
    ######################### reading data ########################
    logger.info("Starting to read training samples...")
    train_texts_1, train_texts_2, labels = data_helper.read_data(args.train_data_file)
    val_texts_1, val_texts_2, val_labels = data_helper.read_data(args.valid_data_file)
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(args.test_data_file)
    logger.info("Finished reading training samples!")

    ######################### load csv data ########################
    # TODO: load a CSV file into DMatrix
    logger.info("Start loading csv.")
    x_train = pd.read_csv(args.x_train_file, header=None, encoding="utf-8", sep="\t")
    x_valid = pd.read_csv(args.x_valid_file, header=None, encoding="utf-8", sep="\t")
    x_test = pd.read_csv(args.x_test_file, header=None, encoding="utf-8", sep="\t")
    y_train = pd.read_csv(args.y_train_file, header=None, encoding="utf-8", sep="\t")
    y_valid = pd.read_csv(args.y_valid_file, header=None, encoding="utf-8", sep="\t")
    # y_test = pd.read_csv(args.y_test_file, header=None, encoding="utf-8", sep="\t")
    d_train = xgb.DMatrix(x_train, y_train)
    d_valid = xgb.DMatrix(x_valid, y_valid)
    logger.info("Done loading csv.")

    ########################## load DMatrix #########################
    # TODO: load an XGBoost binary file into DMatrix
    # d_train = xgb.DMatrix(args.dtrain_file)
    # d_valid = xgb.DMatrix(args.dvalid_file)
    # d_test = xgb.DMatrix(args.dtest_file)

    ########################### train models ########################
    params = {
        "booster": args.booster,
        "eta": args.eta,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
        "min_child_weight": args.min_child_weight,
        "max_delta_step": args.max_delta_step,
        "subsample": args.subsample,
        "colsample_bytree": args.colsample_bytree,
        "colsample_bylevel": args.colsample_bylevel,
        "lambda": args.lamda,
        "alpha": args.alpha,
        "scale_pos_weight": args.scale_pos_weight,
        "objective": args.objective,
        "eval_metric": list(args.eval_metric.split(","))
    }
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    bst = xgb.train(params, d_train, args.num_boost_round, watchlist,
                    early_stopping_rounds=args.early_stopping_rounds)
    bst.save_model(args.model_path)
    bst.dump_model(args.model_path + '.dump')

    ## make the submission
    p_test = bst.predict(xgb.DMatrix(x_test))
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1,
         'candidate_query': test_texts_2,
         'label': test_labels,
         'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file, header=False, index=False,
                  encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    ## make the submission for the best iteration
    p_test = bst.predict(xgb.DMatrix(x_test), ntree_limit=bst.best_ntree_limit)
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1,
         'candidate_query': test_texts_2,
         'label': test_labels,
         'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file + "_best", header=False, index=False,
                  encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    logger.info("best_iteration: {}".format(bst.best_iteration))
    logger.info("ntree_limit=bst.best_ntree_limit: {}".format(bst.best_ntree_limit))
    logger.info("best_score: {}".format(bst.best_score))
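# Illustrative params instance (placeholder values, not tuned results): the
# XGBoost key for L2 regularization is "lambda", which is why the parsed arg
# above is spelled `lamda` to dodge the Python keyword; eval_metric may be a
# list so several metrics are reported on the watchlist.
params_example = {
    "booster": "gbtree",
    "eta": 0.1,
    "max_depth": 6,
    "lambda": 1.0,
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "auc"],
}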
def first_train():
    # TODO: Saving DMatrix into an XGBoost binary file will make loading faster
    #######################################################################
    logger.info("Starting to read Embedding file...")
    word2vec = common_function.load_word2vec(args.embedding_file,
                                             filter_num=args.embedding_dim)
    logger.info("Finished reading Embedding file!")
    logger.info('Found %d word vectors of word2vec' % len(word2vec))

    stop_words = common_function.load_file_2_dict(args.stopword_file, colum=1)
    logger.info("Finished reading stopword file!")
    logger.info('Stopwords are: ' + "|".join(list(stop_words.keys())))

    ############################## reading data #############################
    logger.info("Starting to read training samples...")
    train_texts_1, train_texts_2, labels = data_helper.read_data(args.train_data_file)
    val_texts_1, val_texts_2, val_labels = data_helper.read_data(args.valid_data_file)
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(args.test_data_file)
    logger.info("Finished reading training samples!")

    train_orig = pd.DataFrame({"question1": train_texts_1, "question2": train_texts_2})
    val_orig = pd.DataFrame({"question1": val_texts_1, "question2": val_texts_2})
    test_orig = pd.DataFrame({"question1": test_texts_1, "question2": test_texts_2})

    ############################ save word counts ############################
    total_words = []
    ques = pd.concat([train_orig, val_orig], axis=0).reset_index(drop=True)
    for i in range(ques.shape[0]):
        total_words += ques.question1[i].split('|')
        total_words += ques.question2[i].split('|')

    ## save word frequencies to the counts file
    counts = Counter(total_words)
    with open(args.word_counts_file, 'w', encoding="utf-8") as r:
        for _word, _count in counts.items():
            r.write("%s\t%d\n" % (_word, _count))

    ############################## basic features #############################
    train_cp = train_orig.copy()
    val_cp = val_orig.copy()
    test_cp = test_orig.copy()
    x_train_basic = data_helper.get_basic_feat(train_cp, args.embedding_dim,
                                               stop_words, word2vec)
    x_valid_basic = data_helper.get_basic_feat(val_cp, args.embedding_dim,
                                               stop_words, word2vec)
    x_test_basic = data_helper.get_basic_feat(test_cp, args.embedding_dim,
                                              stop_words, word2vec)

    ####################### sentence word char features #########################
    weights = {word: data_helper.get_weight(count) for word, count in counts.items()}
    x_train_more = data_helper.build_features(train_orig, stop_words, weights)
    x_valid_more = data_helper.build_features(val_orig, stop_words, weights)
    x_test_more = data_helper.build_features(test_orig, stop_words, weights)

    ######################## SCWLSTM model simscore #############################
    if args.use_scwlstm:
        x_train_sim = data_helper.model_simscore(args.train_scwlstm_pred_file, train_cp)
        x_valid_sim = data_helper.model_simscore(args.valid_scwlstm_pred_file, val_cp)
        x_test_sim = data_helper.model_simscore(args.test_scwlstm_pred_file, test_cp)

        ################### combine all features ##############################
        x_train = pd.concat((x_train_basic, x_train_more, x_train_sim), axis=1)
        x_valid = pd.concat((x_valid_basic, x_valid_more, x_valid_sim), axis=1)
        x_test = pd.concat((x_test_basic, x_test_more, x_test_sim), axis=1)
    else:
        x_train = pd.concat((x_train_basic, x_train_more), axis=1)
        x_valid = pd.concat((x_valid_basic, x_valid_more), axis=1)
        x_test = pd.concat((x_test_basic, x_test_more), axis=1)

    x_train.drop(['question1', 'question2'], axis=1, inplace=True)
    x_valid.drop(['question1', 'question2'], axis=1, inplace=True)
    x_test.drop(['question1', 'question2'], axis=1, inplace=True)

    # print(x_train.columns)
    features = [x for x in x_train.columns]
    data_helper.ceate_feature_map(args.feature_map_file, features)
    x_train.columns = [str(i) for i in range(x_train.shape[1])]
    x_valid.columns = [str(i) for i in range(x_valid.shape[1])]
    x_test.columns = [str(i) for i in range(x_test.shape[1])]

    ################################ save csv ###############################
    logger.info("Start saving csv.")
    x_train.to_csv(args.x_train_file, header=False, index=False, encoding="utf-8", sep="\t")
    x_valid.to_csv(args.x_valid_file, header=False, index=False, encoding="utf-8", sep="\t")
    x_test.to_csv(args.x_test_file, header=False, index=False, encoding="utf-8", sep="\t")
    y_train = pd.DataFrame(labels)
    y_valid = pd.DataFrame(val_labels)
    y_test = pd.DataFrame(test_labels)
    y_train.to_csv(args.y_train_file, header=False, index=False, encoding="utf-8", sep="\t")
    y_valid.to_csv(args.y_valid_file, header=False, index=False, encoding="utf-8", sep="\t")
    y_test.to_csv(args.y_test_file, header=False, index=False, encoding="utf-8", sep="\t")
    logger.info("Done saving csv.")

    ############################# save DMatrix ################################
    logger.info("Start saving DMatrix.")
    d_train = xgb.DMatrix(x_train, label=labels)
    d_valid = xgb.DMatrix(x_valid, label=val_labels)
    d_test = xgb.DMatrix(x_test, label=test_labels)
    d_train.save_binary(args.dtrain_file, silent=False)
    d_valid.save_binary(args.dvalid_file, silent=False)
    d_test.save_binary(args.dtest_file, silent=False)
    # logger.info(d_train.feature_names)
    logger.info("Done saving DMatrix.")

    ############################# train models #################################
    params = {
        "booster": args.booster,
        "eta": args.eta,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
        "min_child_weight": args.min_child_weight,
        "max_delta_step": args.max_delta_step,
        "subsample": args.subsample,
        "colsample_bytree": args.colsample_bytree,
        "colsample_bylevel": args.colsample_bylevel,
        "lambda": args.lamda,
        "alpha": args.alpha,
        "scale_pos_weight": args.scale_pos_weight,
        "objective": args.objective,
        "eval_metric": list(args.eval_metric.split(","))
    }
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    bst = xgb.train(params, d_train, args.num_boost_round, watchlist,
                    early_stopping_rounds=args.early_stopping_rounds)
    bst.save_model(args.model_path)
    bst.dump_model(args.model_path + '.dump')

    ## make the submission
    p_test = bst.predict(xgb.DMatrix(x_test))
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1,
         'candidate_query': test_texts_2,
         'label': test_labels,
         'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file, header=False, index=False,
                  encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    ## make the submission for the best iteration
    p_test = bst.predict(xgb.DMatrix(x_test), ntree_limit=bst.best_ntree_limit)
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1,
         'candidate_query': test_texts_2,
         'label': test_labels,
         'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file + "_best", header=False, index=False,
                  encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    logger.info("best_iteration: {}".format(bst.best_iteration))
    logger.info("ntree_limit=bst.best_ntree_limit: {}".format(bst.best_ntree_limit))
    logger.info("best_score: {}".format(bst.best_score))
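# Plausible sketch of data_helper.get_weight, assuming the common
# inverse-frequency smoothing used in Quora-style duplicate detection;
# the eps/min_count defaults are illustrative, not confirmed by this code:
def get_weight(count, eps=10000, min_count=2):
    # very rare words get weight 0; common words get a small, smoothed weight
    return 0.0 if count < min_count else 1.0 / (count + eps)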
def first_predict():
    # TODO: build features the first time we predict
    #################################################################
    logger.info("Starting to read Embedding file...")
    word2vec = common_function.load_word2vec(args.embedding_file,
                                             filter_num=args.embedding_dim)
    logger.info("Finished reading Embedding file!")
    logger.info('Found %d word vectors of word2vec' % len(word2vec))

    stop_words = common_function.load_file_2_dict(args.stopword_file, colum=1)
    logger.info("Finished reading stopword file!")
    logger.info('Stopwords are: ' + "|".join(list(stop_words.keys())))

    ############# reading data #################################################
    logger.info("Starting to read testing samples...")
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(
        args.test_data_file)
    test_orig = pd.DataFrame({
        "question1": test_texts_1,
        "question2": test_texts_2
    })
    logger.info("Finished reading testing samples!")

    ############### read word counts ##########################################
    counts = common_function.load_file_2_dict(args.word_counts_file)
    weights = {
        word: data_helper.get_weight(int(count))
        for word, count in counts.items()
    }

    ################ make features ############################################
    test_cp = test_orig.copy()
    x_test_basic = data_helper.get_basic_feat(test_cp, args.embedding_dim,
                                              stop_words, word2vec)
    x_test_more = data_helper.build_features(test_orig, stop_words, weights)
    if args.use_scwlstm:
        x_test_sim = data_helper.model_simscore(args.test_scwlstm_pred_file,
                                                test_cp)
        ############## combine all features ###################################
        x_test = pd.concat([x_test_basic, x_test_more, x_test_sim], axis=1)
    else:
        x_test = pd.concat((x_test_basic, x_test_more), axis=1)
    x_test.drop(['question1', 'question2'], axis=1, inplace=True)
    x_test.columns = [str(i) for i in range(x_test.shape[1])]

    ################ save DMatrix binary data to make loading faster #########
    xgb.DMatrix(x_test).save_binary('test.buffer')

    ############## predict ####################################################
    bst = xgb.Booster()              # init model
    bst.load_model(args.model_path)  # load model
    p_test = bst.predict(xgb.DMatrix(x_test), ntree_limit=args.ntree_limit)
    df_sub = pd.DataFrame({
        'user_query': test_texts_1,
        'candidate_query': test_texts_2,
        'label': test_labels,
        'score': p_test.ravel()
    })
    df_sub.to_csv(args.pred_data_file + str(args.ntree_limit),
                  header=False, index=False, encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])
tf.flags.DEFINE_integer("decay_steps", 100, "decay steps")
tf.flags.DEFINE_integer("decay_steps1", 100, "decay steps")
tf.flags.DEFINE_float("decay_rate", 0.1, "decay rate")
tf.flags.DEFINE_float("lr", 0.001, "learning rate1")
tf.flags.DEFINE_string("cells_sizes", '80,64', "numbers of cells of each layer")
tf.flags.DEFINE_integer("display_step", 50, "display_step")
tf.flags.DEFINE_integer("save_step", 100, "save_step")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
for attr, value in FLAGS.__flags.items():
    print("attr:%s\tvalue:%s" % (attr, str(value)))

# load data
print("Loading data")
starttime = datetime.datetime.now()
sequences = data_helper.read_data(
    os.path.abspath(os.path.join(os.path.curdir, "data")), FLAGS.n_inputs)
endtime = datetime.datetime.now()
print(str((endtime - starttime).seconds), "seconds")

validation_sequence = sequences.validation.sequence.reshape(
    (-1, FLAGS.n_steps, FLAGS.n_inputs))
test_sequence = sequences.test.sequence.reshape(
    (-1, FLAGS.n_steps, FLAGS.n_inputs))
validation_iter = math.ceil(validation_sequence.shape[0] / FLAGS.batch_size)
print("validation_iter:", validation_iter)
test_iter = math.ceil(test_sequence.shape[0] / FLAGS.batch_size)

boundaries = [50, 100, 150, 200, 300, 400, 500]
learning_rates = [0.001, 0.0001, 0.00001, 0.000001, 0.0000001]
#learning_rates2 = [0.0001, 0.00008, 0.00001, 0.000008, 0.000001]

# train
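# Hedged sketch of how boundaries/learning_rates could drive a step-wise
# schedule with the TF1 API. Note tf.train.piecewise_constant requires
# len(values) == len(boundaries) + 1, so the 7-boundary / 5-rate lists above
# would need trimming or padding; the lists below are adjusted to match:
global_step = tf.Variable(0, trainable=False)
lr = tf.train.piecewise_constant(
    global_step,
    boundaries=[50, 100, 150, 200],
    values=[0.001, 0.0001, 0.00001, 0.000001, 0.0000001])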
def train(self):
    sentence_developing, slot_developing = data_helper.read_data(
        self.sentence_dev, self.slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        self.sentence_train, self.slot_train, max_size=None)

    # Make toy data; uncomment this block to train on a small subset
    # instead of the full dataset
    #n_toy = 1000
    #sentence_training, slot_training = sentence_training[:n_toy],\
    #    slot_training[:n_toy]
    #sentence_developing, slot_developing = sentence_developing[:round(n_toy/2)],\
    #    slot_developing[:round(n_toy/2)]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        self.vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
        self.vocab_slot)

    # For conlleval script
    words_train = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_training
    ]
    labels_train = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_training
    ]
    words_val = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_developing
    ]
    labels_val = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_developing
    ]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)
    model = Sequential()
    model.add(Embedding(n_vocab, 100))
    model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    model.add(Dropout(0.25))
    model.add(GRU(100, return_sequences=True))
    model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    model.compile('rmsprop', 'categorical_crossentropy')

    # Training
    #n_epochs = 30
    n_epochs = 1
    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    print("Training =>")
    train_pred_label = []
    avgLoss = 0
    for i in range(n_epochs):
        print("Training epoch {}".format(i))
        bar = progressbar.ProgressBar(max_value=len(sentence_training))
        for n_batch, sent in bar(enumerate(sentence_training)):
            label = slot_training[n_batch]
            # Make labels one hot
            label = np.eye(n_classes)[label][np.newaxis, :]
            # View each sentence as a batch
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # ignore 1-word sentences
                loss = model.train_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            train_pred_label.append(pred)
        avgLoss = avgLoss / n_batch
        predword_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in train_pred_label
        ]
        con_dict = conlleval(predword_train, labels_train,
                             words_train, 'measure.txt')
        train_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
        # Save model
        model.save(model_file)

        print("Validating =>")
        labels_pred_val = []
        avgLoss = 0
        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            label = slot_developing[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # some bug in keras
                loss = model.test_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)
        avgLoss = avgLoss / n_batch
        predword_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in labels_pred_val
        ]
        con_dict = conlleval(predword_val, labels_val, words_val, 'measure.txt')
        val_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)
            print("Best validation F1 score = {}".format(best_val_f1))
    print()

    # Prevent tensorflow bug: BaseSession.__del__
    gc.collect()
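# Small illustration of the np.eye one-hot trick used in train() above:
# indexing an identity matrix with a label sequence yields one-hot rows,
# and np.newaxis turns the sentence into a batch of one.
import numpy as np

labels = np.array([2, 0, 1])
one_hot = np.eye(4)[labels]       # shape (3, 4): one row per time step
batched = one_hot[np.newaxis, :]  # shape (1, 3, 4): batch dimension added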
use_teacher_forcing = random.random() < 1  # always True; see comment below
for t in range(max_target_len):
    # output: (batch, output_size)
    output, decoder_hidden, attn_weights = decoder(decoder_input,
                                                   decoder_hidden,
                                                   encoder_outputs)
    all_decoder_outputs[t] = output
    # Feeds the ground-truth label at every step; with a ratio < 1 the
    # decoder would sometimes be fed its own previous output instead.
    if use_teacher_forcing:
        decoder_input = target_batches[t]
    else:
        # Pick the top-scoring word from the output for each batch element
        words = []
        for b in range(batch_size):
            topv, topi = output[b].data.topk(1)
            words.append(int(topi[0]))  # extract the index as a plain int
        decoder_input = get_variable(torch.LongTensor(words))

loss = masked_cross_entropy(
    all_decoder_outputs.transpose(0, 1).contiguous(),
    target_batches.transpose(0, 1).contiguous(),
    target_lengths)
print(loss)


if __name__ == '__main__':
    data_dir = './data'
    en_file = "{}/{}".format(data_dir, "seg_en")
    zh_file = "{}/{}".format(data_dir, "seg_zh")
    input_lang, target_lang, pairs = helper.read_data(en_file, zh_file, 20000)
    test_model(pairs, input_lang, target_lang)
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ===================================
print('loading data...')
data_file = 'train_pivot.csv'
product_l, data = data_helper.read_data(data_file)
data_size = data.shape[0]
product_num = len(product_l)
epoch_steps = data_size // FLAGS.batch_size  # whole batches per epoch

# turn into a pandas DataFrame
product_l = pd.DataFrame(product_l)
product_l.columns = ['Producto_ID']
product_l['index1'] = product_l.index

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    sess = tf.Session(config=session_conf)
    rnn = BimboRNN(FLAGS.batch_size, FLAGS.embedding_size, product_num,
                   FLAGS.hidden_size, 6)
    with sess.as_default():