def word2vec_trained_embeddings(self):
    helper._print_header('Getting word2vec trained on Enron corpus...')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    documents = self.get_enron_sentences()
    model_logger = Word2VecLogger()
    if os.path.isfile(FLAGS.word2vec_dir + 'word2vec.model'):
        helper._print_subheader('Loading previously trained model...')
        # Load as a full Word2Vec model so it can continue training below.
        model = Word2Vec.load(FLAGS.word2vec_dir + 'word2vec.model')
    else:
        helper._print_subheader('Building model...')
        model = Word2Vec(
            documents,
            size=300,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
            window=10,
            min_count=3,
            workers=10,
            iter=1)
        helper._print_subheader('Saving untrained model...')
        model.save(FLAGS.word2vec_dir + 'word2vec.model')
    model.train(documents,
                total_examples=len(documents),
                epochs=FLAGS.word2vec_trained_mode_epochs,
                callbacks=[model_logger])
    helper._print_subheader('Saving model...')
    model.save(FLAGS.word2vec_dir + 'trained_word2vec.model')
    return self.word2vec_index_keyed_vector(model.wv)
def glove_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned GloVe embeddings')
    self.glove_download_pretrained_model()
    sentences = self.get_enron_sentences()
    vocab = helper.get_or_build(FLAGS.enron_emails_vocab_path, self.build_vocab, sentences)
    # idx2word = {i: word for word, i in word2idx.items()}
    print(len(vocab))
    cooccur = helper.get_or_build(FLAGS.enron_emails_cooccur_path,
                                  self.build_cooccur, vocab, sentences,
                                  type='numpy')
    print(np.shape(cooccur))
    pretrained_embeddings = self.glove2dict(self.word_embed_file_path)
    helper._print_subheader('Starting Mittens model...')
    mittens_model = Mittens(n=self.dimensions,
                            max_iter=1000,
                            display_progress=1,
                            log_dir=FLAGS.glove_dir + 'mittens/')
    finetuned_embeddings = mittens_model.fit(
        cooccur,
        vocab=vocab,
        initial_embedding_dict=pretrained_embeddings)
    print(finetuned_embeddings)
    return 'test', 'test', 'test'
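# A minimal sketch of the glove2dict helper used above -- an assumption, not
# the repository's actual implementation. It reads a GloVe .txt file where
# each line is "<word> <dim floats>" into a {word: np.ndarray} dict, which is
# the format Mittens expects for initial_embedding_dict.
import numpy as np

def glove2dict(glove_file_path):
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings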
def build_trained_embeddings(self):
    helper._print_header('Getting word2vec trained on Enron corpus...')
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    sentences = self.get_enron_sentences()
    model_logger = Word2VecLogger()
    path = directories.WORD2VEC_DIR + 'trained_word2vec.model'
    if os.path.isfile(path):
        helper._print('Loading previously trained model...')
        word2vec_model = KeyedVectors.load(path)
    else:
        helper._print_subheader('Building model...')
        word2vec_model = gensim.models.Word2Vec(
            sentences,
            size=FLAGS.word_embedding_size,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
            window=FLAGS.word2vec_window,
            min_count=FLAGS.word2vec_min_count,
            workers=10,
            iter=1
        )
        word2vec_model.train(sentences,
                             total_examples=len(sentences),
                             epochs=FLAGS.word2vec_epochs,
                             callbacks=[model_logger])
        helper._print(f'Saving model to {path}')
        word2vec_model.save(path)
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=word2vec_model.wv, vocab=vocab)
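# A quick sanity check one can run on the trained model above (illustrative
# only; 'energy' is an arbitrary query word for the Enron domain):
sanity_model = gensim.models.Word2Vec.load(directories.WORD2VEC_DIR + 'trained_word2vec.model')
print(sanity_model.wv.most_similar('energy', topn=5))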
def build_pretrained_embeddings(self):
    helper._print_header('Getting pretrained GloVe embeddings')
    self.glove_download_pretrained_model()
    sentences = self.get_enron_sentences()
    vocab = self.build_vocab(sentences)
    return self.generate_indexes(vocab, directories.GLOVE_EMBEDDING_FILE_PATH)
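# generate_indexes is shared by the GloVe and fastText loaders. A plausible
# sketch -- an assumption, not the actual implementation: scan the embedding
# file and return a word->index map, its inverse, and an embedding matrix
# restricted to the corpus vocabulary, with row 0 reserved for padding/unknown.
import numpy as np

def generate_indexes(self, vocab, embedding_file_path):
    word2idx = {'<pad>': 0}
    vectors = [np.zeros(self.dimensions, dtype=np.float32)]
    with open(embedding_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word = parts[0]
            if word in vocab:
                word2idx[word] = len(vectors)
                vectors.append(np.asarray(parts[1:], dtype=np.float32))
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word, np.stack(vectors)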
def __init__(self):
    helper._print_header(f"Loading tree data ({FLAGS.dataset})")
    self.train_trees = tree_util.parse_trees(dataset=FLAGS.dataset)
    self.test_trees = tree_util.parse_trees(dataset=FLAGS.dataset, type='test')
    self.val_trees = tree_util.parse_trees(dataset=FLAGS.dataset, type='val')
    self.make_tree_text_file()
def build_pretrained_embeddings(self):
    helper._print_header('Getting pretrained fastText embeddings')
    if self.dimensions != 300:
        raise NotImplementedError(
            'Only word vectors of size 300 are available at this point.')
    self.download_fastText_vectors()
    sentences = self.get_enron_sentences()
    vocab = self.build_vocab(sentences)
    return self.generate_indexes(vocab, directories.FASTTEXT_EMBEDDING_FILE_PATH)
def print_performance(self):
    helper._print_header("Final stats for best model")
    helper._print("Best epoch:", self.speed["best_epoch"])
    helper._print("Running time to best epoch:",
                  str(int(self.speed["best_time"] / (60 * 60))) + "h",
                  str(int(self.speed["best_time"] / 60) % 60) + "m")
    helper._print("Total epochs:", self.speed["epoch"])
    helper._print("Total running time:",
                  str(int(self.speed["total_time"] / (60 * 60))) + "h",
                  str(int(self.speed["total_time"] / 60) % 60) + "m")
    helper._print("Test:", self.performance_test)
    helper._print("Val:", self.performance_val)
    helper._print("Train:", self.performance_train)
def glove_download_pretrained_model(self):
    self.word_embed_file_path = (FLAGS.glove_dir + self.embedding_file + '.'
                                 + str(self.dimensions) + 'd.txt')
    self.glove_zip_path = FLAGS.glove_dir + 'glove.zip'
    if not os.path.isdir(FLAGS.glove_dir):
        os.makedirs(FLAGS.glove_dir)
    if not os.path.isfile(self.word_embed_file_path):
        helper._print_header('Downloading GloVe embedding: {0}'.format(self.embedding_file))
        url = 'http://nlp.stanford.edu/data/wordvecs/' + self.embedding_file + '.zip'
        helper.download(url, self.glove_zip_path)
        with zipfile.ZipFile(self.glove_zip_path, 'r') as zip_file:
            helper._print_header(f'Extracting glove weights from {self.glove_zip_path}')
            zip_file.extractall(path=FLAGS.glove_dir)
def glove_download_pretrained_model(self):
    if not os.path.isdir(directories.GLOVE_DIR):
        os.makedirs(directories.GLOVE_DIR)
    if not os.path.isfile(directories.GLOVE_EMBEDDING_FILE_PATH):
        helper._print_header('Downloading GloVe embedding: {0}'.format(
            directories.GLOVE_EMBEDDING_FILE_NAME))
        url = constants.GLOVE_URL + directories.GLOVE_EMBEDDING_FILE_NAME + '.zip'
        print(url)
        helper.download(url, directories.GLOVE_EMBEDDING_ZIP_PATH)
        with zipfile.ZipFile(directories.GLOVE_EMBEDDING_ZIP_PATH, 'r') as zip_file:
            helper._print_header(
                f'Extracting glove weights from {directories.GLOVE_EMBEDDING_ZIP_PATH}')
            zip_file.extractall(path=directories.GLOVE_DIR)
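# helper.download, used above, only needs to stream a URL to disk. A minimal
# standard-library sketch of such a helper (an assumption; the real one may
# add a progress bar or retries):
import urllib.request

def download(url, destination_path):
    urllib.request.urlretrieve(url, destination_path)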
def build_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned word2vec embeddings')
    path = directories.WORD2VEC_DIR + 'finetuned_word2vec.model'
    pretrained_path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
    sentences = self.get_enron_sentences()
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    if os.path.isfile(path):
        helper._print_subheader('Loading previously fine-tuned model...')
        # Load the saved Word2Vec model directly so .wv is available below.
        finetuned_model = gensim.models.Word2Vec.load(path)
    else:
        if self.dimensions != 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()
        if not os.path.isfile(pretrained_path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        helper._print_subheader('Unpacking ' + pretrained_path)
        model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
        helper._print_subheader('Done unpacking!')
        finetuned_model = gensim.models.Word2Vec(
            size=FLAGS.word_embedding_size,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
            window=FLAGS.word2vec_window,
            min_count=FLAGS.word2vec_min_count,
            workers=10,
            iter=1
        )
        helper._print_subheader('Building fine-tuned model vocab...')
        finetuned_model.build_vocab(sentences)
        helper._print_subheader('Updating with pretrained model vocab...')
        finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
        helper._print_subheader('Intersection with pretrained vectors...')
        finetuned_model.intersect_word2vec_format(pretrained_path, binary=True, lockf=1.0)
        model_logger = Word2VecLogger()
        finetuned_model.train(sentences,
                              total_examples=len(sentences),
                              epochs=FLAGS.word2vec_epochs,
                              callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        finetuned_model.save(path)
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=finetuned_model.wv, vocab=vocab)
def build_pretrained_embeddings(self):
    helper._print_header('Getting pretrained word2vec embeddings')
    path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
    sentences = self.get_enron_sentences()
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    if self.dimensions != 300:
        helper._print('Only support word2vec with vectors of size 300')
    if not os.path.isfile(path):
        helper._print(
            'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
        sys.exit()
    else:
        helper._print_subheader('Unpacking ' + path)
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        helper._print_subheader('Done unpacking!')
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=model, vocab=vocab)
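# word2vec_index_keyed_vector is the gensim counterpart of generate_indexes.
# A sketch under the same assumptions (not the repository's code): map the
# corpus vocabulary onto the KeyedVectors' rows, again reserving index 0, and
# return the same (word2idx, idx2word, matrix) triple. Uses the gensim 3.x
# KeyedVectors.vocab attribute, as the surrounding code does.
import numpy as np

def word2vec_index_keyed_vector(self, keyed_vector, vocab):
    word2idx = {'<pad>': 0}
    vectors = [np.zeros(self.dimensions, dtype=np.float32)]
    for word in vocab:
        if word in keyed_vector.vocab:
            word2idx[word] = len(vectors)
            vectors.append(keyed_vector[word])
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word, np.stack(vectors)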
def load_enron_txt_data(self):
    helper._print_header("Loading Enron emails")
    try:
        if os.name == 'nt':
            # Using sys.maxsize throws an OverflowError on Windows 64-bit
            # platforms since the internal representation of 'int'/'long' on
            # Win64 is only 32-bit wide. Ideally the limit on Win64 should not
            # exceed ((2**31)-1) as long as the internal representation uses
            # 'int' and/or 'long'.
            csv.field_size_limit((2 ** 31) - 1)
        else:
            csv.field_size_limit(sys.maxsize)
    except OverflowError:
        # skip setting the limit for now
        pass
    if not os.path.isfile(directories.ENRON_EMAILS_CSV_PATH):
        data = 'wcukierski/enron-email-dataset'
        helper._print_subheader('Downloading enron emails from Kaggle')
        helper.download_from_kaggle(data, directories.ENRON_DIR)
        helper._print_subheader('Download finished! Unzipping...')
        with zipfile.ZipFile(directories.ENRON_EMAILS_ZIP_PATH, 'r') as zip_file:
            zip_file.extractall(path=directories.ENRON_DIR)
    if not os.path.isfile(directories.ENRON_EMAILS_TXT_PATH):
        helper._print_subheader('Processing emails into .txt file!')
        with open(directories.ENRON_EMAILS_CSV_PATH, 'r', encoding='utf-8') as emails_csv:
            with open(directories.ENRON_EMAILS_TXT_PATH, 'w', encoding='utf-8') as text_file:
                email_reader = csv.reader(emails_csv, delimiter=",")
                for index, row in enumerate(email_reader):
                    if index == 0:
                        continue  # skip the CSV header row
                    sentences = nltk.sent_tokenize(self.format_email_body(row))
                    for sent in sentences:
                        if len(sent.split(' ')) > 2:
                            text_file.write(sent + '\n')
                    if index % 100000 == 0 and index != 0:
                        helper._print(f'{index} emails processed')
    helper._print_subheader('Enron email data loaded!')
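# format_email_body is applied to each CSV row before sentence-tokenizing. A
# minimal sketch, assuming the Kaggle CSV's second column holds the raw RFC822
# message (headers plus body) and that only the body text is wanted:
import email

def format_email_body(self, row):
    message = email.message_from_string(row[1])
    body = message.get_payload()
    if isinstance(body, list):  # multipart message: keep the first part
        body = body[0].get_payload()
    return str(body).replace('\n', ' ').replace('\r', ' ')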
def word2vec_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned word2vec embeddings')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    if os.path.isfile(FLAGS.word2vec_dir + 'finetuned_word2vec.model'):
        helper._print_subheader('Loading previously fine-tuned model...')
        # Load the fine-tuned model that was checked for above.
        finetuned_model = Word2Vec.load(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
    else:
        if self.dimensions != 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()
        binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
        if not os.path.isfile(binary_file_path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        helper._print_subheader('Unpacking ' + binary_file_path)
        model = KeyedVectors.load_word2vec_format(binary_file_path, binary=True)
        helper._print_subheader('Done unpacking!')
        sentences = self.get_enron_sentences()
        finetuned_model = Word2Vec(size=300, min_count=3)
        helper._print_subheader('Building fine-tuned model vocab...')
        finetuned_model.build_vocab(sentences)
        helper._print_subheader('Updating with pretrained model vocab...')
        finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
        helper._print_subheader('Intersection with pretrained vectors...')
        finetuned_model.intersect_word2vec_format(binary_file_path, binary=True, lockf=1.0)
        model_logger = Word2VecLogger()
        finetuned_model.train(sentences,
                              total_examples=len(sentences),
                              epochs=FLAGS.word2vec_finetuned_mode_epochs,
                              callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        finetuned_model.save(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
    return self.word2vec_index_keyed_vector(finetuned_model.wv)
def word2vec_pretrained_embeddings(self):
    helper._print_header('Getting pretrained word2vec embeddings')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    self.word_embed_file_path = FLAGS.word2vec_dir + self.embedding_file + '.txt'
    if self.dimensions != 300:
        helper._print('Only support word2vec with vectors of size 300')
    binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
    if not os.path.isfile(binary_file_path):
        helper._print(
            'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
        sys.exit()
    # Always load from the binary file so `model` is defined before use.
    helper._print_subheader('Unpacking ' + binary_file_path)
    model = KeyedVectors.load_word2vec_format(binary_file_path, binary=True)
    helper._print_subheader('Done unpacking!')
    return self.word2vec_index_keyed_vector(model)
def __init__(self, model, sess, saver, summary, load, gpu,
             batch_size=FLAGS.batch_size, epochs=FLAGS.epochs):
    helper._print_header("Training " + model.model_name)
    helper._print("Load model:", load)
    helper._print("Model:", model.__class__.__name__)
    helper._print("Use GPU:", gpu)
    helper._print("Test ratio:", tree_util.ratio_of_labels(model.data.test_trees))
    helper._print("Validation ratio:", tree_util.ratio_of_labels(model.data.val_trees))
    helper._print("Train ratio:", tree_util.ratio_of_labels(model.data.train_trees))
    helper._print("Batch size:", batch_size)
    helper._print("Max epochs:", epochs)
    helper._print("Convergence epochs:", FLAGS.conv_cond)
    helper._print("Max pre-training epochs:", FLAGS.pretrain_max_epoch)
    helper._print("L2-regularization scalar:", FLAGS.l2_scalar)
    helper._print("Dropout probability:", FLAGS.dropout_prob)
    helper._print("Representation size:", FLAGS.sentence_embedding_size)
    helper._print("Word embedding model:",
                  FLAGS.word_embed_model + ' (' + FLAGS.word_embed_mode + ')')
    helper._print("Use root loss:", FLAGS.use_root_loss)
    helper._print("Use selective training:", FLAGS.use_selective_training)
    helper._print("Decay for every step:", FLAGS.lr_decay)
    helper._print("Learning rate start:", FLAGS.learning_rate)
    self.model = model
    self.batch_size = batch_size
    self.sess = sess
    self.saver = saver
    self.summary = summary
def train_classifier():
    data = get_data()
    classifier = Sequential()
    classifier.add(Dense(100,
                         activation=tf.nn.relu,
                         input_shape=(FLAGS.sentence_embedding_size,)))
    for i in range(1 - 1):  # no extra hidden layers: range(1 - 1) is empty
        classifier.add(Dense(100,
                             activation='relu',
                             kernel_regularizer=tf.keras.regularizers.l2(0.3)))
        classifier.add(Dropout(0.5))
    classifier.add(Dense(2, activation='softmax'))
    classifier.compile(optimizer=Adagrad(0.01),
                       loss='categorical_crossentropy',
                       metrics=['accuracy', Recall(), Precision(), f1])
    classifier.summary()
    helper._print_header('Training classifier')
    classifier.fit(data['train'][0],
                   data['train'][1],
                   batch_size=FLAGS.classifier_batch_size,
                   validation_data=(data['val'][0], data['val'][1]),
                   epochs=200,
                   callbacks=[
                       EarlyStopping(monitor='val_accuracy', patience=25, min_delta=0.01),
                       SaveBestModelCallback()
                   ],
                   verbose=2)
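# The f1 passed to compile() above is a custom metric function. A common
# Keras-style definition is sketched below -- an assumption about what the
# repository's f1 computes (batch-wise F1 from precision and recall on
# one-hot labels):
from tensorflow.keras import backend as K

def f1(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())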
def build_model(self):
    helper._print_header("Constructing tRNN structure")

    # phrase node tensors
    rep_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                               clear_after_read=False, infer_shape=False)
    o_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                             clear_after_read=False, infer_shape=False)

    helper._print_header("Building tRNN tree structure")

    # build the tRNN structure
    def embed_word(word_index):
        return tf.nn.embedding_lookup(self.embeddings, word_index)

    def build_node(left_child, right_child, rep_array):
        left_is_leaf = tf.gather(self.is_leaf_array, left_child)
        right_is_leaf = tf.gather(self.is_leaf_array, right_child)

        # reshape from vector to matrix with height 300 and width 1
        rep_l = tf.reshape(rep_array.read(left_child), [300, 1])
        rep_r = tf.reshape(rep_array.read(right_child), [300, 1])

        left = tf.cond(left_is_leaf,
                       lambda: tf.matmul(self.W_l, rep_l) + self.b_W,
                       lambda: tf.matmul(self.U_l, rep_l) + self.b_U)
        right = tf.cond(right_is_leaf,
                        lambda: tf.matmul(self.W_r, rep_r) + self.b_W,
                        lambda: tf.matmul(self.U_r, rep_r) + self.b_U)

        # relu( (sent_size , 1) + (sent_size , 1) + (sent_size , 1) ) = (sent_size , 1)
        return tf.nn.leaky_relu(left + right)

    def tree_construction_body(rep_array, o_array, i):
        # gather variables
        is_leaf = tf.gather(self.is_leaf_array, i)
        word_index = tf.gather(self.word_index_array, i)
        left_child = tf.gather(self.left_child_array, i)
        right_child = tf.gather(self.right_child_array, i)

        # embed_word = (word_size, 1)
        # build_node = (sent_size , 1)
        rep = tf.cond(is_leaf,
                      lambda: embed_word(word_index),
                      lambda: build_node(left_child, right_child, rep_array))
        if FLAGS.dropout_prob > 0:
            rep = tf.nn.dropout(rep, rate=self.dropout_rate)
        rep_array = rep_array.write(i, rep)

        # o_none = (label_size, 1)
        # softmax( (label_size, sent_size) * (sent_size, 1) + (label_size, 1)) = (label_size, 1)
        o = tf.cond(is_leaf,
                    lambda: self.o_none,
                    lambda: tf.matmul(self.V, rep) + self.b_p)  # TODO maybe without activation function
        o_array = o_array.write(i, o)

        i = tf.add(i, 1)
        return rep_array, o_array, i

    termination_cond = lambda rep_a, o_a, i: tf.less(
        i, tf.squeeze(tf.shape(self.is_leaf_array)))

    tf.print('hello', self.is_leaf_array, output_stream=sys.stderr)

    self.rep_array, self.o_array, _ = tf.while_loop(
        cond=termination_cond,
        body=tree_construction_body,
        loop_vars=(rep_array, o_array, 0),
        parallel_iterations=1)
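# The TensorArray + while_loop pattern above is the core graph-construction
# trick: nodes are visited in topological order and later iterations read
# earlier results. A self-contained TF 1.x toy of the same pattern (prefix
# sums instead of tree nodes), for reference only:
import tensorflow as tf

xs = tf.constant([1.0, 2.0, 3.0, 4.0])
acc = tf.TensorArray(tf.float32, size=0, dynamic_size=True, clear_after_read=False)

def prefix_sum_body(acc, i):
    prev = tf.cond(tf.equal(i, 0), lambda: 0.0, lambda: acc.read(i - 1))
    acc = acc.write(i, prev + xs[i])  # each step reads the previous entry
    return acc, i + 1

acc, _ = tf.while_loop(lambda a, i: i < tf.shape(xs)[0],
                       prefix_sum_body, (acc, 0), parallel_iterations=1)
with tf.Session() as sess:
    print(sess.run(acc.stack()))  # [ 1.  3.  6. 10.]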
def build_trained_embeddings(self):
    helper._print_header('Getting trained GloVe embeddings')
    vocab, _, _ = self.train_and_save_embeddings()
    return self.generate_indexes(vocab, directories.TRAINED_GLOVE_EMBEDDING_FILE_PATH)
def glove_pretrained_embeddings(self):
    helper._print_header('Getting pretrained GloVe embeddings')
    self.glove_download_pretrained_model()
    return self.glove_generate_indexes()
def selective_train(model, load=False, gpu=True, batch_size=FLAGS.batch_size,
                    epochs=FLAGS.epochs, run_times=[], epoch_times=[],
                    conv_cond=FLAGS.conv_cond, num_threads=FLAGS.num_threads):
    if gpu:
        config = None
    else:
        config = tf.ConfigProto(device_count={'GPU': 0})
    with tf.Session(config=config) as sess:
        # initialization
        saver = tf.train.Saver()
        selector = Selector(model, sess, FLAGS.num_clusters, FLAGS.cluster_model)
        summary = summarizer(model.model_name, sess)
        summary.construct_dir()
        trainer = Trainer(model, sess, saver, summary, load=load, gpu=gpu, batch_size=batch_size)
        if load:
            model.load_tmp(sess, saver)
            summary.load()
        else:
            model.initialize(sess)
            summary.initialize()
        summary.save_parameters(lr=model.learning_rate,
                                lr_end=model.learning_rate_end,
                                gpu=gpu,
                                lr_decay=model.lr_decay,
                                conv_cond=conv_cond,
                                model=model.__class__.__name__,
                                number_variables=model.get_no_trainable_variables(),
                                max_epochs=epochs,
                                optimizer=model.optimizer)

        # Pre-training
        train_data, val_data, test_data = (model.data.train_trees,
                                           model.data.val_trees,
                                           model.data.test_trees)
        while not summary.dropping() and not summary.interrupt():
            summary.epoch_inc()
            helper._print_subheader(f'Epoch {summary.get_epoch()} (Pre-training)')
            trainer.train(train_data)
            summary.compute(summary.VAL, data=model.data.val_trees, model=model, _print=True)
            summary.save_history()
            summary.time_tick()
            if summary.new_best_acc(summary.VAL):
                helper._print("New best val model found!")
                model.save_best(sess, saver, summary.VAL)
            if summary.new_best_acc(summary.TRAIN):
                helper._print("New best train model found!")
                model.save_best(sess, saver, summary.TRAIN)
            else:
                helper._print("No new best model found!!! Prev best training acc:",
                              summary.best_acc[summary.TRAIN])
            # summary.dropping_tick()
            summary.save_speed()
            summary.pre_tick()

        # todo maybe allow multiple repeat selective training
        # Selecting
        helper._print_header('PRETRAINING ENDED!')
        model.load_best(sess, saver, summary.TRAIN)
        model.save_pre_end(sess, saver, summary.TRAIN)

        # Main training
        first = True
        while not summary.converging() and not summary.interrupt():
            summary.main_count_tick()
            if summary.re_cluster():
                # if main_count == 1 or (FLAGS.use_multi_cluster and main_count % int(FLAGS.pretrain_max_epoch/4) == 0):
                helper._print_header('Clustering for MAIN TRAINING!')
                train_data_selection, cluster_predictions = selector.select_data(
                    model.data.train_trees, FLAGS.selection_cut_off)
                summary.save_cluster_predictions(cluster_predictions)
                summary.time_tick("Selection time:")
            elif first and FLAGS.load_model:
                cluster_predictions = summary.load_cluster_predictions()
                train_data_selection, cluster_predictions = selector.select_data(
                    model.data.train_trees, FLAGS.selection_cut_off,
                    cluster_predictions=cluster_predictions)
            first = False
            summary.epoch_inc()
            helper._print_subheader(f'Epoch {summary.get_epoch()} (Main training)')
            helper._print(
                f'Using {len(train_data_selection)}/{len(train_data)} '
                f'({len(train_data_selection) / len(train_data) * 100}%) for training data.')
            trainer.train(train_data_selection)
            summary.compute(summary.VAL, data=model.data.val_trees, model=model, _print=True)
            summary.save_history()
            summary.time_tick()
            if summary.new_best_acc(summary.VAL):
                helper._print("New best model found!")
                model.save_best(sess, saver, summary.VAL)
            else:
                helper._print("No new best model found!!! Prev best validation acc:",
                              summary.best_acc[summary.VAL])
            summary.converging_tick()
            summary.save_speed()

        model.load_best(sess, saver, summary.VAL)
        summary.save_performance(model)
        summary.print_performance()
def __init__(self, data):
    """
    :param data: utils.data
    """
    helper._print_header("Constructing treeRNN constants, placeholders and variables")

    # Setup data
    self.data = data  # TODO: Make data
    self.embeddings = tf.constant(data.word_embed_util.embeddings)

    # constants
    # leaf constant output
    o_none = tf.constant(-1.0, shape=[FLAGS.label_size, 1])
    # loss weight constant, w > 1 puts more weight on sensitive loss
    self.weight = tf.constant(FLAGS.sensitive_weight)

    # tree structure placeholders
    self.root_array = tf.placeholder(tf.int32, (None), name='root_array')
    self.is_leaf_array = tf.placeholder(tf.bool, (None), name='is_leaf_array')
    self.word_index_array = tf.placeholder(tf.int32, (None), name='word_index_array')
    self.left_child_array = tf.placeholder(tf.int32, (None), name='left_child_array')
    self.right_child_array = tf.placeholder(tf.int32, (None), name='right_child_array')
    self.label_array = tf.placeholder(tf.int32, (None, FLAGS.label_size), name='label_array')

    # initializers
    xavier_initializer = tf.contrib.layers.xavier_initializer()
    weight_initializer = xavier_initializer
    if FLAGS.weight_initializer == "identity":
        def custom_initializer(shape_list, dtype, partition_info):
            return (tf.initializers.identity(gain=0.5)(shape_list, dtype, partition_info)
                    + tf.initializers.random_uniform(minval=-0.05, maxval=0.05)(
                        shape_list, dtype, partition_info))
        weight_initializer = custom_initializer
    bias_initializer = xavier_initializer
    if FLAGS.bias_initializer == "zero":
        bias_initializer = tf.initializers.zeros()

    # encoding variables (leaf weights)
    W_l = tf.get_variable(name='W_l',
                          shape=[FLAGS.sentence_embedding_size, FLAGS.word_embedding_size],
                          initializer=weight_initializer)
    W_r = tf.get_variable(name='W_r',
                          shape=[FLAGS.sentence_embedding_size, FLAGS.word_embedding_size],
                          initializer=weight_initializer)
    self.W_l = W_l
    self.W_r = W_r

    # phrase weights
    U_l = tf.get_variable(name='U_l',
                          shape=[FLAGS.sentence_embedding_size, FLAGS.sentence_embedding_size],
                          initializer=weight_initializer)
    U_r = tf.get_variable(name='U_r',
                          shape=[FLAGS.sentence_embedding_size, FLAGS.sentence_embedding_size],
                          initializer=weight_initializer)
    self.U_l = U_l
    self.U_r = U_r

    # bias
    b_W = tf.get_variable(name='b_W', shape=[FLAGS.sentence_embedding_size, 1],
                          initializer=bias_initializer)
    b_U = tf.get_variable(name='b_U', shape=[FLAGS.sentence_embedding_size, 1],
                          initializer=bias_initializer)

    # classifier weights
    V = tf.get_variable(name='V',
                        shape=[FLAGS.label_size, FLAGS.sentence_embedding_size],
                        initializer=xavier_initializer)
    b_p = tf.get_variable(name='b_p', shape=[FLAGS.label_size, 1],
                          initializer=bias_initializer)
    self.V = V
    self.b_p = b_p

    helper._print_header("Constructing tRNN structure")

    # phrase node tensors
    rep_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                               clear_after_read=False, infer_shape=False)
    o_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                             clear_after_read=False, infer_shape=False)

    helper._print_header("Building tRNN tree structure")

    # build the tRNN structure
    def embed_word(word_index):
        return tf.nn.embedding_lookup(self.embeddings, word_index)

    def build_node(left_child, right_child, rep_array):
        left_is_leaf = tf.gather(self.is_leaf_array, left_child)
        right_is_leaf = tf.gather(self.is_leaf_array, right_child)

        # reshape from vector to matrix with height 300 and width 1
        rep_l = tf.reshape(rep_array.read(left_child), [300, 1])
        rep_r = tf.reshape(rep_array.read(right_child), [300, 1])

        left = tf.cond(left_is_leaf,
                       lambda: tf.matmul(W_l, rep_l) + b_W,
                       lambda: tf.matmul(U_l, rep_l) + b_U)
        right = tf.cond(right_is_leaf,
                        lambda: tf.matmul(W_r, rep_r) + b_W,
                        lambda: tf.matmul(U_r, rep_r) + b_U)

        # relu( (sent_size , 1) + (sent_size , 1) + (sent_size , 1) ) = (sent_size , 1)
        return tf.nn.leaky_relu(left + right)

    def tree_construction_body(rep_array, o_array, i):
        # gather variables
        is_leaf = tf.gather(self.is_leaf_array, i)
        word_index = tf.gather(self.word_index_array, i)
        left_child = tf.gather(self.left_child_array, i)
        right_child = tf.gather(self.right_child_array, i)

        # embed_word = (word_size, 1)
        # build_node = (sent_size , 1)
        rep = tf.cond(is_leaf,
                      lambda: embed_word(word_index),
                      lambda: build_node(left_child, right_child, rep_array))
        rep_array = rep_array.write(i, rep)

        # o_none = (label_size, 1)
        # softmax( (label_size, sent_size) * (sent_size, 1) + (label_size, 1)) = (label_size, 1)
        o = tf.cond(is_leaf,
                    lambda: o_none,
                    lambda: tf.matmul(V, rep) + b_p)  # TODO maybe without activation function
        o_array = o_array.write(i, o)

        i = tf.add(i, 1)
        return rep_array, o_array, i

    termination_cond = lambda rep_a, o_a, i: tf.less(
        i, tf.squeeze(tf.shape(self.is_leaf_array)))

    tf.print('hello', self.is_leaf_array, output_stream=sys.stderr)

    self.rep_array, self.o_array, _ = tf.while_loop(
        cond=termination_cond,
        body=tree_construction_body,
        loop_vars=(rep_array, o_array, 0),
        parallel_iterations=1)

    self.loss = self.get_loss()
    self.acc = self.get_acc_batch()

    self.global_step = tf.train.create_global_step()
    if FLAGS.lr_decay:
        n = int(len(self.data.train_trees) / FLAGS.batch_size)
        total_steps = FLAGS.epochs * n
        decay_steps = n
        decay_rate = (FLAGS.learning_rate_end / FLAGS.learning_rate) ** (decay_steps / total_steps)
        self.learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                                        self.global_step,
                                                        decay_steps,
                                                        decay_rate,
                                                        name='learning_rate')
        helper._print_header("Using learning rate with exponential decay")
        helper._print("Decay for every step:", decay_rate)
        helper._print("Learning rate start:", FLAGS.learning_rate)
        helper._print("Learning rate end:", FLAGS.learning_rate_end)
        helper._print("After number of epochs:", FLAGS.epochs)
    else:
        self.learning_rate = tf.constant(FLAGS.learning_rate)

    if FLAGS.optimizer == constants.ADAM_OPTIMIZER:
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss, global_step=self.global_step)
    else:  # FLAGS.optimizer == constants.ADAGRAD_OPTIMIZER
        self.train_op = tf.train.AdagradOptimizer(self.learning_rate).minimize(
            self.loss, global_step=self.global_step)

    self.init = tf.global_variables_initializer()

    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('accuracy', self.acc)
    self.merged_summary_op = tf.summary.merge_all()
def cross_validation():
    data = get_data()
    x_train, y_train = data['train']
    x_val, y_val = data['val']
    x_test, y_test = data['test']

    helper._print_header('Searching the parameter space')
    params = {
        'lr': [0.1, 0.01],
        'optimizer': [Adagrad],
        'activation': [relu],
        'dropout': [0, 0.2, 0.5],
        'regularization': [0, 0.01, 0.001],
        # 'weights1': [1, 2],
        # 'weights2': [1],
        'loss_functions': ['categorical_crossentropy'],
        'layers': [1, 3],
        'layer_size': [100, 300],
        'batch_size': [4],
    }
    params_test = {
        'lr': [0.1],
        'optimizer': [Adagrad],
        'activation': [relu],
        'dropout': [0.2],
        'regularization': [0.01],
        # 'weights1': [2],
        # 'weights2': [1],
        'loss_functions': ['categorical_crossentropy'],
        'layers': [3],
        'layer_size': [100],
        'batch_size': [64, 4],
    }
    ta.Scan(
        model=mlp_model, x=x_train, y=y_train, x_val=x_val, y_val=y_val,
        params=params, dataset_name=FLAGS.model_name, experiment_no='Adagrad_V1',
        clear_tf_session=False, print_params=False,
    )
    ta.Scan(
        model=mlp_model, x=x_train, y=y_train, x_val=x_val, y_val=y_val,
        params=params, dataset_name=FLAGS.model_name, experiment_no='Adagrad_V2',
        clear_tf_session=False, print_params=False,
    )
    ta.Scan(
        model=mlp_model, x=x_train, y=y_train, x_val=x_val, y_val=y_val,
        params=params, dataset_name=FLAGS.model_name, experiment_no='Adagrad_V3',
        clear_tf_session=False, print_params=False,
    )
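# ta.Scan expects `model` to be a function of (x_train, y_train, x_val, y_val,
# params) that builds, fits and returns (history, model). A sketch of what
# mlp_model plausibly looks like given the params grid above -- an assumption,
# not the repository's actual definition:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

def mlp_model(x_train, y_train, x_val, y_val, params):
    model = Sequential()
    model.add(Dense(params['layer_size'], activation=params['activation'],
                    input_shape=(x_train.shape[1],)))
    for _ in range(params['layers'] - 1):
        model.add(Dense(params['layer_size'], activation=params['activation'],
                        kernel_regularizer=l2(params['regularization'])))
        model.add(Dropout(params['dropout']))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=params['optimizer'](params['lr']),
                  loss=params['loss_functions'], metrics=['accuracy'])
    history = model.fit(x_train, y_train, batch_size=params['batch_size'],
                        validation_data=(x_val, y_val), epochs=100, verbose=0)
    return history, model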
def __init__(self, data):
    """
    :param data: utils.data
    """
    helper._print_header(
        "Constructing treeRNN friendly constants, placeholders and variables")

    # Setup data
    self.data = data  # TODO: Make data
    self.embeddings = tf.constant(data.word_embed_util.embeddings)

    # constants
    # leaf constant output
    self.rep_zero = tf.constant(0., shape=[FLAGS.sentence_embedding_size, 1])
    self.word_zero = tf.constant(0., shape=[FLAGS.word_embedding_size, 1])
    self.label_zero = tf.constant(0., shape=[FLAGS.label_size, 1])
    # loss weight constant, w > 1 puts more weight on sensitive loss
    self.weight = tf.constant(FLAGS.sensitive_weight)

    # tree structure placeholders
    self.root_array = tf.placeholder(tf.int32, (None), name='root_array')
    self.is_leaf_array = tf.placeholder(tf.bool, (None, None), name='is_leaf_array')
    self.word_index_array = tf.placeholder(tf.int32, (None, None), name='word_index_array')
    self.left_child_array = tf.placeholder(tf.int32, (None, None), name='left_child_array')
    self.right_child_array = tf.placeholder(tf.int32, (None, None), name='right_child_array')
    self.label_array = tf.placeholder(tf.int32, (None, None, FLAGS.label_size), name='label_array')

    # initializers
    xavier_initializer = tf.contrib.layers.xavier_initializer()
    weight_initializer = xavier_initializer
    if FLAGS.weight_initializer == "identity":
        def custom_initializer(shape_list, dtype, partition_info):
            return (tf.initializers.identity(gain=0.5)(shape_list, dtype, partition_info)
                    + tf.initializers.random_uniform(minval=-0.05, maxval=0.05)(
                        shape_list, dtype, partition_info))
        weight_initializer = custom_initializer
    bias_initializer = xavier_initializer
    if FLAGS.bias_initializer == "zero":
        bias_initializer = tf.initializers.zeros()

    # encoding variables
    W = tf.get_variable(name='W',
                        shape=[FLAGS.sentence_embedding_size, FLAGS.word_embedding_size],
                        initializer=weight_initializer)
    self.W = W

    # phrase weights
    U_l = tf.get_variable(name='U_l',
                          shape=[FLAGS.sentence_embedding_size, FLAGS.sentence_embedding_size],
                          initializer=weight_initializer)
    U_r = tf.get_variable(name='U_r',
                          shape=[FLAGS.sentence_embedding_size, FLAGS.sentence_embedding_size],
                          initializer=weight_initializer)
    self.U_l = U_l
    self.U_r = U_r
    self.weights = tf.concat([W, U_l, U_r], axis=1)

    # bias
    self.b = tf.get_variable(name='b', shape=[FLAGS.sentence_embedding_size, 1],
                             initializer=bias_initializer)

    # classifier weights
    V = tf.get_variable(name='V',
                        shape=[FLAGS.label_size, FLAGS.sentence_embedding_size],
                        initializer=xavier_initializer)
    b_p = tf.get_variable(name='b_p', shape=[FLAGS.label_size, 1],
                          initializer=bias_initializer)
    self.V = V
    self.b_p = b_p

    helper._print_header("Constructing tRNN structure")

    # phrase node tensors
    rep_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                               clear_after_read=False, infer_shape=False)
    rep_array = rep_array.write(0, self.rep_zero)
    o_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                             clear_after_read=False, infer_shape=False)
    o_array = o_array.write(0, self.label_zero)
    word_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                                clear_after_read=False, infer_shape=False)
    word_array = word_array.write(0, self.word_zero)

    helper._print_header("Building tRNN tree structure")

    batch_indices = [[i, i] for i in range(FLAGS.batch_size)]

    def gather_rep(step, children_indices, rep_a):
        children = tf.squeeze(tf.gather(children_indices, step, axis=1))
        return tf.gather_nd(rep_a.gather(children), batch_indices)

    # build the tRNN structure
    def embed_word(word_index):
        return tf.nn.embedding_lookup(self.embeddings, word_index)
        # return tf.cond(
        #     is_leaf,
        #     lambda: tf.nn.embedding_lookup(self.embeddings, word_index),
        #     lambda: self.word_zero
        # )

    def build_node(i, rep_array, word_array):
        # reshape from vector to matrix with height 300 and width 1
        print_op = tf.print("i:", i,
                            "right children:",
                            tf.squeeze(tf.gather(self.left_child_array, i, axis=1)),
                            output_stream=sys.stdout)
        with tf.control_dependencies([print_op]):
            rep_l = gather_rep(i, self.left_child_array, rep_array)
            rep_r = gather_rep(i, self.right_child_array, rep_array)
        rep_word = word_array.read(i)

        left = tf.matmul(rep_l, self.U_l)
        right = tf.matmul(rep_r, self.U_r)
        word = tf.matmul(rep_word, self.W)

        return tf.nn.leaky_relu(word + left + right + self.b)

    def tree_construction_body(rep_array, word_array, o_array, i):
        # gather variables
        word_index = tf.gather(self.word_index_array, i)

        # embed_word = (word_size, 1)
        word_emb = embed_word(word_index)
        word_array = word_array.write(i, word_emb)

        # build_node = (sent_size , 1)
        rep = build_node(i, rep_array, word_array)
        rep_array = rep_array.write(i, rep)

        o = tf.matmul(V, rep) + b_p
        o_array = o_array.write(i, o)

        i = tf.add(i, 1)
        return rep_array, word_array, o_array, i

    termination_cond = lambda rep_a, word_a, o_a, i: tf.less(
        i, tf.gather(tf.shape(self.is_leaf_array), 1))

    self.rep_array, self.word_array, self.o_array, _ = tf.while_loop(
        cond=termination_cond,
        body=tree_construction_body,
        loop_vars=(rep_array, word_array, o_array, 1),
        parallel_iterations=1)

    self.loss = self.get_loss()
    self.acc = self.get_acc_batch()

    self.global_step = tf.train.create_global_step()
    if FLAGS.lr_decay:
        n = int(len(self.data.train_trees) / FLAGS.batch_size)
        total_steps = FLAGS.epochs * n
        decay_steps = n
        decay_rate = (FLAGS.learning_rate_end / FLAGS.learning_rate) ** (decay_steps / total_steps)
        self.learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                                        self.global_step,
                                                        decay_steps,
                                                        decay_rate,
                                                        name='learning_rate')
        helper._print_header("Using learning rate with exponential decay")
        helper._print("Decay for every step:", decay_rate)
        helper._print("Learning rate start:", FLAGS.learning_rate)
        helper._print("Learning rate end:", FLAGS.learning_rate_end)
        helper._print("After number of epochs:", FLAGS.epochs)
    else:
        self.learning_rate = FLAGS.learning_rate

    if FLAGS.optimizer == constants.ADAM_OPTIMIZER:
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss, global_step=self.global_step)
    else:  # FLAGS.optimizer == constants.ADAGRAD_OPTIMIZER
        self.train_op = tf.train.AdagradOptimizer(self.learning_rate).minimize(
            self.loss, global_step=self.global_step)

    self.init = tf.global_variables_initializer()

    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('accuracy', self.acc)
    self.merged_summary_op = tf.summary.merge_all()
def build_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned GloVe embeddings')
    self.glove_download_pretrained_model()
    vocab, _, _ = self.train_and_save_finetuned_embeddings()
    return self.generate_indexes(vocab, directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
def on_epoch_begin(self, model):
    helper._print_header(f'Epoch: {self.epoch}/{model.iter}')
    self.epoch += 1
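# For context: on_epoch_begin above is a method of the Word2VecLogger callback
# that the training functions pass to gensim. A minimal sketch of the full
# class, assuming gensim's CallbackAny2Vec interface (gensim 3.x, where the
# configured epoch count lives in model.iter):
from gensim.models.callbacks import CallbackAny2Vec

class Word2VecLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 1

    def on_epoch_begin(self, model):
        helper._print_header(f'Epoch: {self.epoch}/{model.iter}')
        self.epoch += 1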
def train_old(self):
    helper._print_header("Training tRNN")
    helper._print("Test ratio:", tree_util.ratio_of_labels(self.data.test_trees))
    helper._print("Validation ratio:", tree_util.ratio_of_labels(self.data.val_trees))
    helper._print("Train ratio:", tree_util.ratio_of_labels(self.data.train_trees))

    # todo make a flag for this
    config = tf.ConfigProto(device_count={'GPU': 0})
    with tf.Session(config=config) as sess:
        model_placement = FLAGS.models_dir + FLAGS.model_name + "model.ckpt"

        # Summary writers for the training, validation and test acc/loss - used for TensorBoard
        self.make_needed_dir()
        directory = FLAGS.logs_dir + FLAGS.model_name
        train_writer = tf.summary.FileWriter(directory + 'train', sess.graph)
        validation_writer = tf.summary.FileWriter(directory + 'validation')
        test_writer = tf.summary.FileWriter(directory + 'test')

        history = self.get_history()
        starting_steps = 0
        best_acc = 0

        # Run the init
        saver = tf.train.Saver()
        self.run_tensorboard()
        if FLAGS.load_model:
            history, starting_steps, best_acc = self.load_history()
            helper._print("Previously", starting_steps,
                          "steps have been run, best acc was:", best_acc)
            self.load_model(sess, model_placement, saver)
            self.write_history_to_summary(history, train_writer, validation_writer, test_writer)
            sess.run(tf.assign(self.global_step, starting_steps))
        else:
            sess.run(self.init)
            self.handle_val_test(history, sess, test_writer, 0, validation_writer)

        start_time = time.time()
        loss_total = 0
        acc_total = 0
        for epoch in range(FLAGS.epochs):
            helper._print_header("Epoch " + str(epoch + 1))
            batch_size = FLAGS.batch_size if epoch >= 10 else 1
            print_interval = FLAGS.print_step_interval / batch_size
            for step, tree in enumerate(
                    helper.batches(np.random.permutation(self.data.train_trees),
                                   batch_size)):  # todo build train get_trees
                if step % int(print_interval) == 0:
                    total_step = (starting_steps + epoch * int(len(self.data.train_trees))
                                  + step * batch_size)
                    helper._print("Step:", total_step)
                    helper._print("Learning rate:", sess.run(self.learning_rate))
                    avg_acc = acc_total / print_interval
                    avg_loss = loss_total / print_interval
                    if epoch != 0 or step != 0:
                        self.write_to_summary(avg_acc, avg_loss, total_step, train_writer)
                        helper._print("Train - acc:", avg_acc, "loss:", avg_loss)
                        history["train"].append((total_step, avg_acc, avg_loss))
                        val_acc = self.handle_val_test(history, sess, test_writer,
                                                       total_step, validation_writer)
                        loss_total = 0
                        acc_total = 0
                        if val_acc > best_acc:
                            best_acc = val_acc
                            helper._print("A better model was found!")
                            saver.save(sess, model_placement)
                            np.savez(FLAGS.histories_dir + FLAGS.model_name + 'history.npz',
                                     train=history["train"],
                                     test=history["test"],
                                     val=history["val"],
                                     total_steps=total_step,
                                     best_acc=best_acc)
                            helper._print("Model saved!")
                feed_dict = self.build_feed_dict_batch(tree)  # todo maybe change to batches
                _, acc, loss = sess.run([self.train_op, self.acc, self.loss],
                                        feed_dict=feed_dict)
                acc_total += acc
                loss_total += loss
            helper._print("Avg Epoch Time:",
                          (time.time() - start_time) / (epoch + 1) / 60, "m")