def train(self, epoch, merged_sum, writer):
    # if epoch > 10 and epoch % 5 == 0 and self.lr_init > 0.00025:
    #     self.lr_init = self.lr_init * 0.75
    #     self.lr.assign(self.lr_init).eval()
    total_loss = 0.
    i = 0
    iterator = data_iterator_len(self.source_data_path,
                                 self.target_data_path,
                                 read_vocabulary(self.source_vocab_path),
                                 read_vocabulary(self.target_vocab_path),
                                 self.max_size, self.batch_size)
    for dsource, slen, dtarget, tlen in iterator:
        outputs = self.sess.run(
            [self.loss, self.optim, merged_sum],
            feed_dict={
                self.source: dsource,
                self.target: dtarget,
                self.target_len: tlen,
                self.dropout: self.init_dropout
            })
        loss = outputs[0]
        itr = self.train_iters * epoch + i
        total_loss += loss
        if itr % 2 == 0:
            writer.add_summary(outputs[-1], itr)
        if itr % 10 == 0:
            print("[Train] [Time: {}] [Epoch: {}] [Iteration: {}] [lr: {}] [Loss: {}] [Perplexity: {}]"
                  .format(datetime.now(), epoch, itr, self.lr_init, loss, np.exp(loss)))
            sys.stdout.flush()
        i += 1
    self.train_iters = i
    return total_loss / i
def __init__(self, rng, x, n_hidden):
    super(GRU, self).__init__()

    self.minibatch_size = tf.shape(x)[1]
    self.n_hidden = n_hidden

    self.x_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
    self.y_vocabulary = data.read_vocabulary(data.PUNCT_VOCAB_FILE)
    self.x_vocabulary_size = len(self.x_vocabulary)
    self.y_vocabulary_size = len(self.y_vocabulary)

    # Input model: embeddings are shared between forward and backward model
    self.We = weights_Glorot(self.x_vocabulary_size, n_hidden, 'We', rng)

    self.GRU_f = GRUCell(rng=rng, n_in=n_hidden, n_out=n_hidden,
                         minibatch_size=self.minibatch_size)
    self.GRU_b = GRUCell(rng=rng, n_in=n_hidden, n_out=n_hidden,
                         minibatch_size=self.minibatch_size)

    # Output model
    self.GRU = GRUCell(rng=rng, n_in=n_hidden * 2, n_out=n_hidden,
                       minibatch_size=self.minibatch_size)
    self.Wy = tf.Variable(tf.zeros([n_hidden, self.y_vocabulary_size]))
    self.by = tf.Variable(tf.zeros([1, self.y_vocabulary_size]))

    # Attention model
    n_attention = n_hidden * 2  # to match concatenated forward and reverse model states
    # output model's previous hidden state -> attention model weights
    self.Wa_h = weights_Glorot(n_hidden, n_attention, 'Wa_h', rng)
    # contexts -> attention model weights
    self.Wa_c = weights_Glorot(n_attention, n_attention, 'Wa_c', rng)
    self.ba = tf.Variable(tf.zeros([1, n_attention]))
    self.Wa_y = weights_Glorot(n_attention, 1, 'Wa_y', rng)  # gives weights to contexts

    # Late fusion parameters
    self.Wf_h = tf.Variable(tf.zeros([n_hidden, n_hidden]))
    self.Wf_c = tf.Variable(tf.zeros([n_attention, n_hidden]))
    self.Wf_f = tf.Variable(tf.zeros([n_hidden, n_hidden]))
    self.bf = tf.Variable(tf.zeros([1, n_hidden]))

    self.params = [
        self.We, self.Wy, self.by, self.Wa_h, self.Wa_c, self.ba, self.Wa_y,
        self.Wf_h, self.Wf_c, self.Wf_f, self.bf
    ]
    self.params += self.GRU.params + self.GRU_f.params + self.GRU_b.params
    print([p.shape for p in self.params])
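# `weights_Glorot` and `GRUCell` are defined elsewhere in the repo. For
# reference, a minimal sketch of what `weights_Glorot` plausibly does,
# assuming the standard Glorot/Xavier uniform initializer and a NumPy
# RandomState for `rng` (the signature is inferred from the calls above;
# this is an assumption, not the original implementation):
import numpy as np
import tensorflow as tf

def weights_Glorot(n_in, n_out, name, rng):
    # Glorot uniform: limit = sqrt(6 / (fan_in + fan_out))
    limit = np.sqrt(6.0 / (n_in + n_out))
    w = rng.uniform(low=-limit, high=limit, size=(n_in, n_out)).astype(np.float32)
    return tf.Variable(w, name=name)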
def main(_): config = FLAGS if config.dataset == "small": data_config = small elif config.dataset == "medium": data_config = medium elif config.dataset == "debug": data_config = debug else: raise Exception("[!] Unknown dataset {}".format(config.dataset)) #print(data_config.source_data_path) config.source_data_path = data_config.source_data_path config.target_data_path = data_config.target_data_path config.source_vocab_path = data_config.source_vocab_path config.target_vocab_path = data_config.target_vocab_path s_nwords = len(read_vocabulary(config.source_vocab_path)) t_nwords = len(read_vocabulary(config.target_vocab_path)) config.s_nwords = s_nwords config.t_nwords = t_nwords #print("config:", config.__dict__) #print("end") #pp(config.__dict__["__flags"]) gpu_options = tf.GPUOptions(visible_device_list="0") # ckpt_name = "default.epoch0" tf.reset_default_graph() with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: # saver = tf.train.import_meta_graph(os.path.join("checkpoints",ckpt_name+".meta")) # saver.restore(sess, os.path.join("checkpoints",ckpt_name)) attn = AttentionNN(config, sess) if config.sample: attn.load() samples = attn.sample(config.sample) print_samples(samples) else: if not config.is_test: attn.load() print("start training!") attn.run(data_config.valid_source_data_path, data_config.valid_target_data_path) else: attn.load() loss = attn.test(data_config.test_source_data_path, data_config.test_target_data_path) print("[Test] [Loss: {}] [Perplexity: {}]".format( loss, np.exp(loss))) samples = attn.sample_in_place( data_config.test_source_data_path, data_config.test_target_data_path) from attention import get_bleu_score get_bleu_score(samples, data_config.test_target_data_path)
def main(_): config = FLAGS if config.dataset == "small": data_config = small elif config.dataset == "medium": data_config = medium elif config.dataset == "debug": data_config = debug else: raise Exception("[!] Unknown dataset {}".format(config.dataset)) config.source_data_path = data_config.source_data_path config.target_data_path = data_config.target_data_path config.source_vocab_path = data_config.source_vocab_path config.target_vocab_path = data_config.target_vocab_path s_nwords = len(read_vocabulary(config.source_vocab_path)) t_nwords = len(read_vocabulary(config.target_vocab_path)) config.s_nwords = s_nwords config.t_nwords = t_nwords pp(config.__dict__["__flags"]) with tf.Session() as sess: attn = AttentionNN(config, sess) if config.sample: attn.load() samples = attn.sample(config.sample) print_samples(samples) else: if not config.is_test: attn.run(data_config.valid_source_data_path, data_config.valid_target_data_path) else: attn.load() loss = attn.test(data_config.test_source_data_path, data_config.test_target_data_path) print("[Test] [Loss: {}] [Perplexity: {}]".format( loss, np.exp(loss))) samples = attn.sample(data_config.test_source_data_path) get_bleu_score(samples, data_config.test_target_data_path)
def test(self, source_data_path, target_data_path):
    iterator = data_iterator_len(source_data_path, target_data_path,
                                 read_vocabulary(self.source_vocab_path),
                                 read_vocabulary(self.target_vocab_path),
                                 self.max_size, self.batch_size)
    total_loss = 0
    i = 0
    for dsource, slen, dtarget, tlen in iterator:
        loss, = self.sess.run(
            [self.loss],
            feed_dict={
                self.source: dsource,
                self.target: dtarget,
                self.target_len: tlen,
                self.dropout: 0.0  # disable dropout for evaluation
            })
        total_loss += loss
        i += 1
    return total_loss / i
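# `test` returns the mean per-batch loss, and the callers above report
# perplexity as np.exp(loss). That identity holds when the loss is the mean
# per-token cross-entropy in nats; a quick numeric illustration (the value
# here is hypothetical):
import numpy as np

mean_loss = 4.2            # hypothetical mean cross-entropy from test()
print(np.exp(mean_loss))   # perplexity ~ 66.7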
def sample_in_place(self, source_data_path, target_data_path):
    source_vocab = read_vocabulary(self.source_vocab_path)
    target_vocab = read_vocabulary(self.target_vocab_path)
    inv_target_vocab = {target_vocab[k]: k for k in target_vocab}
    iterator = data_iterator_len(source_data_path, target_data_path,
                                 source_vocab, target_vocab,
                                 self.max_size, self.batch_size)
    samples = []
    for dsource, slen, dtarget, tlen in iterator:
        # Alternative: feed a <s>-prefixed, pad-only target instead of the reference:
        # dtarget = [[target_vocab["<s>"]] + [target_vocab["<pad>"]] * (self.max_size - 1)]
        # dtarget = dtarget * self.batch_size
        probs, = self.sess.run([self.probs],
                               feed_dict={
                                   self.source: dsource,
                                   self.target: dtarget,
                                   self.dropout: 0.0
                               })
        for b in range(self.batch_size):
            samples.append([inv_target_vocab[np.argmax(p)] for p in probs[b]])
    return samples
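# The loop above decodes greedily: each step takes the argmax over the output
# distribution and maps the index back through the inverted target vocabulary.
# A self-contained toy version of that inner loop (all names and values here
# are hypothetical, not taken from the model):
import numpy as np

inv_target_vocab = {0: "<pad>", 1: "hello", 2: "world"}
probs = np.array([[[0.1, 0.8, 0.1],     # shaped (batch, time, vocab)
                   [0.2, 0.1, 0.7]]])
sample = [inv_target_vocab[int(np.argmax(p))] for p in probs[0]]
print(sample)  # ['hello', 'world']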
def punctuate(text_data):
    model_file = "Model_models.py_h256_lr0.02.pcl"

    vocab_len = len(data.read_vocabulary(data.WORD_VOCAB_FILE))
    x_len = (vocab_len if vocab_len < data.MAX_WORD_VOCABULARY_SIZE
             else data.MAX_WORD_VOCABULARY_SIZE + data.MIN_WORD_COUNT_IN_VOCAB)
    x = np.ones((x_len, main.MINIBATCH_SIZE)).astype(int)

    print("Loading model parameters...")
    net, _ = models.load(model_file, x)

    print("Building model...")
    word_vocabulary = net.x_vocabulary
    punctuation_vocabulary = net.y_vocabulary
    reverse_punctuation_vocabulary = {v: k for k, v in net.y_vocabulary.items()}

    # Strip existing punctuation (including the Devanagari danda) before re-punctuating.
    text = re.sub('[,?!।]', '', text_data)
    # text = text_data

    punctuated = play_with_model.punctuate(word_vocabulary, punctuation_vocabulary,
                                           reverse_punctuation_vocabulary, text, net)
    return punctuated
# (Python 2 / Theano script)
if len(sys.argv) > 3:
    num_hidden = int(sys.argv[3])
else:
    sys.exit("'Hidden layer size' argument missing!")

if len(sys.argv) > 4:
    initial_learning_rate = float(sys.argv[4])
else:
    sys.exit("'Learning rate' argument missing!")

model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, initial_learning_rate)
model_file = model_path + "/" + model_file_name
print num_hidden, initial_learning_rate, model_file

word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)

x = T.imatrix('x')
y = T.imatrix('y')
lr = T.scalar('lr')

continue_with_previous = False
if os.path.isfile(model_file):
    print "Found an existing model with the name %s" % model_file
    sys.exit()

if continue_with_previous:
    print "Loading previous model state"
if len(sys.argv) > 2:
    num_hidden = int(sys.argv[2])
else:
    sys.exit("'Hidden layer size' argument missing!")

if len(sys.argv) > 3:
    learning_rate = float(sys.argv[3])
else:
    sys.exit("'Learning rate' argument missing!")

model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate)
print num_hidden, learning_rate, model_file_name

word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)

x = T.imatrix('x')
y = T.imatrix('y')
lr = T.scalar('lr')

continue_with_previous = False
if os.path.isfile(model_file_name):
    while True:
        resp = raw_input("Found an existing model with the name %s. Do you want to:\n"
                         "[c]ontinue training the existing model?\n"
                         "[r]eplace the existing model and train a new one?\n"
                         "[e]xit?\n>" % model_file_name)
        resp = resp.lower().strip()
        if resp not in ('c', 'r', 'e'):
            continue
        if resp == 'e':
if len(sys.argv) > 1:
    model_file = sys.argv[1]
else:
    sys.exit("Model file path argument missing")

if len(sys.argv) > 2:
    input_file = sys.argv[2]
else:
    sys.exit("Input file path argument missing")

if len(sys.argv) > 3:
    output_file = sys.argv[3]
else:
    sys.exit("Output file path argument missing")

vocab_len = len(data.read_vocabulary(data.WORD_VOCAB_FILE))
x_len = (vocab_len if vocab_len < data.MAX_WORD_VOCABULARY_SIZE
         else data.MAX_WORD_VOCABULARY_SIZE + data.MIN_WORD_COUNT_IN_VOCAB)
x = np.ones((x_len, main.MINIBATCH_SIZE)).astype(int)

print("Loading model parameters...")
net, _ = models.load(model_file, x)

print("Building model...")
word_vocabulary = net.x_vocabulary
punctuation_vocabulary = net.y_vocabulary
reverse_word_vocabulary = {v: k for k, v in word_vocabulary.items()}
reverse_punctuation_vocabulary = {v: k for k, v in punctuation_vocabulary.items()}

with codecs.open(input_file, 'r', 'utf-8') as f:
from keras.layers import (Input, Masking, Embedding, GRU, Dense, Activation,
                          Permute, RepeatVector, Bidirectional, Add, Multiply,
                          TimeDistributed, Lambda, Reshape, Dropout,
                          BatchNormalization)
from keras.models import Model
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, TerminateOnNaN
from keras import backend as K
from keras import optimizers

sys.path.insert(0, 'local/punctuation')
import data

EMBEDDING_DIM = 256  # OK? For an English vocabulary this is often lower; my rnnlm uses 1024.
MINIBATCH_SIZE = 128
DROP_RATE = 0.2
TIME_STEPS = data.MAX_SEQUENCE_LEN
LABEL_DIM = len(data.PUNCTUATION_VOCABULARY)
PADDING_VALUE = len(data.read_vocabulary(data.WORD_VOCAB_FILE)) + 10
EPOCHS = 10


def get_data(file_name, shuffle):
    '''Get training and evaluation data'''
    dataset = data.load(file_name)
    if shuffle:
        np.random.shuffle(dataset)
    x = [seq[0] for seq in dataset]
    y = [seq[1] for seq in dataset]
    return np.asarray(x), to_categorical(np.asarray(y), num_classes=LABEL_DIM)


def perplexity(y_true, y_pred):
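# The snippet is truncated at `perplexity`; its body is not shown above. A
# common Keras definition, offered here only as an assumed sketch (hence the
# different name), exponentiates the mean categorical cross-entropy:
def perplexity_sketch(y_true, y_pred):
    # exp of the mean per-step cross-entropy in nats = perplexity
    return K.exp(K.mean(K.categorical_crossentropy(y_true, y_pred)))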
# -*- coding: utf-8 -*-
import numpy as np

from model import Seq2Seq
import data as data_util

ddir = '../data/'
ckpt = '../ckpt/'

trainX = np.load(ddir + 'trainX.npy')
trainY = np.load(ddir + 'trainY.npy')
testX = np.load(ddir + 'testX.npy')
testY = np.load(ddir + 'testY.npy')
validX = np.load(ddir + 'validX.npy')
validY = np.load(ddir + 'validY.npy')

vocabulary = data_util.read_vocabulary(ddir)
word_to_num = dict(zip(vocabulary, range(len(vocabulary))))  # e.g. {".": 0, ",": 1, "不": 2, "人": 3}
num_to_word = dict(zip(range(len(vocabulary)), vocabulary))

xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
print("xseq_len = ", xseq_len, "yseq_len = ", yseq_len)

batch_size = 64
# xvocab_size = len(metadata['idx2w'])
xvocab_size = len(vocabulary)
yvocab_size = xvocab_size
emb_dim = 1024

model = Seq2Seq(xseq_len=xseq_len,
def compute_error(target_paths, predicted_paths):
    punctuation_vocabulary = data.read_vocabulary(data.PUNCT_VOCAB_FILE)

    counter = 0
    total_correct = 0

    correct = 0.
    substitutions = 0.
    deletions = 0.
    insertions = 0.

    true_positives = {}
    false_positives = {}
    false_negatives = {}

    for target_path, predicted_path in zip(target_paths, predicted_paths):
        target_punctuation = " "
        predicted_punctuation = " "

        t_i = 0
        p_i = 0

        with codecs.open(target_path, 'r', 'utf-8') as target, \
             codecs.open(predicted_path, 'r', 'utf-8') as predicted:
            target_stream = target.read().split()
            predicted_stream = predicted.read().split()

            while True:
                if data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in punctuation_vocabulary:
                    # skip multiple consecutive punctuations
                    while data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in punctuation_vocabulary:
                        target_punctuation = data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i])
                        target_punctuation = MAPPING.get(target_punctuation, target_punctuation)
                        t_i += 1
                else:
                    target_punctuation = " "

                if predicted_stream[p_i] in punctuation_vocabulary:
                    predicted_punctuation = MAPPING.get(predicted_stream[p_i], predicted_stream[p_i])
                    p_i += 1
                else:
                    predicted_punctuation = " "

                is_correct = target_punctuation == predicted_punctuation

                counter += 1
                total_correct += is_correct

                if predicted_punctuation == " " and target_punctuation != " ":
                    deletions += 1
                elif predicted_punctuation != " " and target_punctuation == " ":
                    insertions += 1
                elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation == target_punctuation:
                    correct += 1
                elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation != target_punctuation:
                    substitutions += 1

                true_positives[target_punctuation] = true_positives.get(target_punctuation, 0.) + float(is_correct)
                false_positives[predicted_punctuation] = false_positives.get(predicted_punctuation, 0.) + float(not is_correct)
                false_negatives[target_punctuation] = false_negatives.get(target_punctuation, 0.) + float(not is_correct)

                assert target_stream[t_i] == predicted_stream[p_i] or predicted_stream[p_i] == "<unk>", \
                    ("File: %s \n"
                     "Error: %s (%s) != %s (%s) \n"
                     "Target context: %s \n"
                     "Predicted context: %s") % \
                    (target_path,
                     target_stream[t_i], t_i, predicted_stream[p_i], p_i,
                     " ".join(target_stream[t_i - 2:t_i + 2]),
                     " ".join(predicted_stream[p_i - 2:p_i + 2]))

                t_i += 1
                p_i += 1

                if t_i >= len(target_stream) - 1 and p_i >= len(predicted_stream) - 1:
                    break

    overall_tp = 0.0
    overall_fp = 0.0
    overall_fn = 0.0

    print("-" * 46)
    print("{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION', 'PRECISION', 'RECALL', 'F-SCORE'))
    for p in punctuation_vocabulary:
        if p == data.SPACE:
            continue

        overall_tp += true_positives.get(p, 0.)
        overall_fp += false_positives.get(p, 0.)
        overall_fn += false_negatives.get(p, 0.)

        punctuation = p
        precision = (true_positives.get(p, 0.) / (true_positives.get(p, 0.) + false_positives[p])) if p in false_positives else nan
        recall = (true_positives.get(p, 0.) / (true_positives.get(p, 0.) + false_negatives[p])) if p in false_negatives else nan
        f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan
        print("{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision, 3) * 100,
                                                round(recall, 3) * 100, round(f_score, 3) * 100))

    print("-" * 46)
    pre = overall_tp / (overall_tp + overall_fp) if overall_fp else nan
    rec = overall_tp / (overall_tp + overall_fn) if overall_fn else nan
    f1 = (2. * pre * rec) / (pre + rec) if (pre + rec) else nan
    print("{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre, 3) * 100,
                                            round(rec, 3) * 100, round(f1, 3) * 100))
    print("Err: %s%%" % round((100.0 - float(total_correct) / float(counter - 1) * 100.0), 2))
    print("SER: %s%%" % round((substitutions + deletions + insertions) /
                              (correct + substitutions + deletions) * 100, 1))
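# The per-class scores above are the standard precision/recall/F1, and SER
# counts substitutions, deletions, and insertions against the reference
# punctuation. A self-contained numeric check with toy counts (not from any
# real run):
tp, fp, fn = 80.0, 20.0, 40.0
precision = tp / (tp + fp)                            # 0.80
recall = tp / (tp + fn)                               # ~0.667
f1 = 2.0 * precision * recall / (precision + recall)  # ~0.727

correct, substitutions, deletions, insertions = 80.0, 10.0, 30.0, 20.0
ser = (substitutions + deletions + insertions) / (correct + substitutions + deletions)
print(precision, recall, f1, ser)  # 0.8 0.666... 0.727... 0.5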
def trainModel(model, xTrain, yTrain, xVal, yVal, num_gpus, model_file, logdir,
               callbacks=None, verbose=False):
    sys.stderr.write("Training" + "\n")

    # From https://github.com/flomlo/ntm_keras/blob/master/testing_utils.py
    tensorboard = TensorBoard(log_dir=logdir, batch_size=MINIBATCH_SIZE,
                              histogram_freq=1, write_grads=True, write_images=True)
    # Optionally also log embeddings: embeddings_freq=1,
    # embeddings_layer_names='embedding',
    # embeddings_metadata=model_path + '/logs' + 'metadata.tsv', embeddings_data=xTrain
    checkpoint = ModelCheckpoint(logdir + "/model.ckpt.{epoch:04d}.hdf5",
                                 monitor='val_loss', verbose=1,
                                 save_best_only=True, period=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=1)
    cbs = [tensorboard, early_stopping, checkpoint]  # TerminateOnNaN,

    if verbose:
        # Fit one epoch at a time so per-epoch hooks can run in between.
        for i in range(0, EPOCHS):
            model.fit(xTrain, yTrain, validation_data=(xVal, yVal),
                      epochs=i + 1, batch_size=MINIBATCH_SIZE,
                      callbacks=cbs, initial_epoch=i)
            print("currently at epoch {0}".format(i + 1))
            # Some test function
        model.save(model_file + '{epoch:02d}.hdf5')
    else:
        # Try each GPU in turn until one is free.
        for i in range(num_gpus):
            try:
                os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
                model.fit(xTrain, yTrain, validation_data=(xVal, yVal),
                          epochs=EPOCHS, batch_size=MINIBATCH_SIZE, callbacks=cbs)
                break
            except InternalError:  # tf.errors.InternalError, imported elsewhere
                print("GPU {0} is not available".format(str(i)))
        model.save(model_file + '.hdf5')
    return model


if __name__ == "__main__":
    if len(sys.argv) > 1:
        model_path = os.path.abspath(sys.argv[1])
    else:
        sys.exit("'Model path' argument missing!")
    if len(sys.argv) > 2:
        model_name = sys.argv[2]
    else:
        sys.exit("'Model name' argument missing!")
    if len(sys.argv) > 3:
        num_hidden = int(sys.argv[3])
    else:
        sys.exit("'Hidden layer size' argument missing!")
    if len(sys.argv) > 4:
        lr = float(sys.argv[4])
    else:
        sys.exit("'Learning rate' argument missing!")

    model_file_name = "Model_%s_h%d_lr%s" % (model_name, num_hidden, lr)
    model_file = model_path + "/" + model_file_name
    logdir = model_path + "/logs/" + model_file_name
    print(num_hidden, lr, model_file)

    word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
    punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)

    continue_with_previous = False
    if os.path.isfile(model_file):
        print("Found an existing model with the name %s" % model_file)
        sys.exit()

    # Count NVIDIA GPUs by grepping lspci output.
    import subprocess, re
    gpu_info = subprocess.check_output(('lspci'))
    num_gpus = len(re.findall('VGA compatible controller: NVIDIA Corporation',
                              str(gpu_info), flags=0))

    xTrain, yTrain = get_data(data.TRAIN_FILE, True)
    xVal, yVal = get_data(data.DEV_FILE, False)

    model = createModel(num_hidden, lr)
    trainModel(model, xTrain, yTrain, xVal, yVal, num_gpus, model_file, logdir)
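# Counting GPUs by grepping `lspci` (as above) misses NVIDIA devices that are
# not VGA-class controllers. A sketch of an alternative probe via `nvidia-smi -L`,
# assuming that binary is installed (a suggestion, not part of the original script):
import subprocess

def count_gpus():
    # `nvidia-smi -L` prints one line per device, e.g. "GPU 0: Tesla K80 (...)"
    try:
        out = subprocess.check_output(["nvidia-smi", "-L"]).decode()
    except (OSError, subprocess.CalledProcessError):
        return 0
    return sum(1 for line in out.splitlines() if line.startswith("GPU "))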