Example #1
    def word2vec_trained_embeddings(self):
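        # Load a cached word2vec model if one exists, otherwise build a new skip-gram
        # model, then continue training on the Enron sentences and index the keyed vectors.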
        helper._print_header('Getting word2vec trained on Enron corpus...')
        if not os.path.isdir(FLAGS.word2vec_dir):
            os.makedirs(FLAGS.word2vec_dir)
        documents = self.get_enron_sentences()
        model_logger = Word2VecLogger()
        if os.path.isfile(FLAGS.word2vec_dir + 'word2vec.model'):
            helper._print_subheader('Loading previously trained model...')
            model = KeyedVectors.load(FLAGS.word2vec_dir + 'word2vec.model')
        else:
            helper._print_subheader('Building model...')
            model = Word2Vec(
                documents,
                size=300,
                sg=1,  # Use Skip-Gram (0 for CBOW)
                hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
                window=10,
                min_count=3,
                workers=10,
                iter=1)
            helper._print_subheader('Saving untrained model...')
            model.save(FLAGS.word2vec_dir + 'word2vec.model')
        model.train(documents,
                    total_examples=len(documents),
                    epochs=FLAGS.word2vec_trained_mode_epochs,
                    callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        model.save(FLAGS.word2vec_dir + 'trained_word2vec.model')

        return self.word2vec_index_keyed_vector(model.wv)
Example #2
    def glove_finetuned_embeddings(self):
        helper._print_header('Getting fine-tuned GloVe embeddings')
        self.glove_download_pretrained_model()
        sentences = self.get_enron_sentences()
        vocab = helper.get_or_build(FLAGS.enron_emails_vocab_path,
                                    self.build_vocab, sentences)
        # idx2word = {i: word for word, i in word2idx.items()}
        print(len(vocab))
        cooccur = helper.get_or_build(FLAGS.enron_emails_cooccur_path,
                                      self.build_cooccur,
                                      vocab,
                                      sentences,
                                      type='numpy')
        print(np.shape(cooccur))
        pretrained_embeddings = self.glove2dict(self.word_embed_file_path)
        helper._print_subheader('Starting Mittens model...')
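        # Mittens fine-tunes the pretrained GloVe vectors against the Enron
        # co-occurrence matrix, warm-starting from initial_embedding_dict.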
        mittens_model = Mittens(n=self.dimensions,
                                max_iter=1000,
                                display_progress=1,
                                log_dir=FLAGS.glove_dir + 'mittens/')
        finetuned_embeddings = mittens_model.fit(
            cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
        print(finetuned_embeddings)

        return 'test', 'test', 'test'
Example #3
 def build_trained_embeddings(self):
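     # Train word2vec on the Enron sentences unless a previously trained model is cached on disk.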
     helper._print_header('Getting word2vec trained on Enron corpus...')
     if not os.path.isdir(directories.WORD2VEC_DIR):
         os.makedirs(directories.WORD2VEC_DIR)
     sentences = self.get_enron_sentences()
     model_logger = Word2VecLogger()
     path = directories.WORD2VEC_DIR + 'trained_word2vec.model'
     if os.path.isfile(path):
         helper._print('Loading previously trained model...')
         word2vec_model = KeyedVectors.load(path)
     else:
         helper._print_subheader('Building model...')
         word2vec_model = gensim.models.Word2Vec(
             sentences,
             size=FLAGS.word_embedding_size,
             sg=1,  # Use Skip-Gram (0 for CBOW)
             hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
             window=FLAGS.word2vec_window,
             min_count=FLAGS.word2vec_min_count,
             workers=10,
             iter=1
         )
         pool = multiprocessing.Pool()
         word2vec_model.train(sentences, total_examples=len(sentences), epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
         # word2vec_model.train(sentences, total_examples=len(sentences), epochs=FLAGS.word2vec_epochs)
         helper._print(f'Saving model to {path}')
         word2vec_model.save(path)
     vocab = self.build_vocab(sentences)
     return self.word2vec_index_keyed_vector(keyed_vector=word2vec_model.wv, vocab=vocab)
Example #4
 def build_pretrained_embeddings(self):
     helper._print_header('Getting pretrained GloVe embeddings')
     self.glove_download_pretrained_model()
     sentences = self.get_enron_sentences()
     vocab = self.build_vocab(sentences)
     return self.generate_indexes(vocab,
                                  directories.GLOVE_EMBEDDING_FILE_PATH)
Example #5
    def __init__(self):
        helper._print_header(f"Loading tree data ({FLAGS.dataset})")
        self.train_trees = tree_util.parse_trees(dataset=FLAGS.dataset)
        self.test_trees = tree_util.parse_trees(dataset=FLAGS.dataset,
                                                type='test')
        self.val_trees = tree_util.parse_trees(dataset=FLAGS.dataset,
                                               type="val")

        self.make_tree_text_file()
Example #6
 def build_pretrained_embeddings(self):
     helper._print_header('Getting pretrained fastText embeddings')
     if self.dimensions != 300:
         raise NotImplementedError(
             'Only word vectors of size 300 are available at this point.')
     self.download_fastText_vectors()
     sentences = self.get_enron_sentences()
     vocab = self.build_vocab(sentences)
     return self.generate_indexes(vocab,
                                  directories.FASTTEXT_EMBEDDING_FILE_PATH)
Example #7
    def print_performance(self):
        helper._print_header("Final stats for best model")
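        # Timings in self.speed are stored in seconds and printed as whole hours and minutes.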

        helper._print("Best epoch:", self.speed["best_epoch"])
        helper._print("Total running time:",
                      str(int(self.speed["best_time"] / (60 * 60))) + "h",
                      str((int(self.speed["best_time"] / 60) % 60)) + "m")
        helper._print("Total epochs:", self.speed["epoch"])
        helper._print("Total running time:",
                      str(int(self.speed["total_time"] / (60 * 60))) + "h",
                      str((int(self.speed["total_time"] / 60) % 60)) + "m")

        helper._print("Test:", self.performance_test)
        helper._print("Val:", self.performance_val)
        helper._print("Train:", self.performance_train)
Example #8
 def glove_download_pretrained_model(self):
     self.word_embed_file_path = FLAGS.glove_dir + self.embedding_file + '.' + str(
         self.dimensions) + 'd.txt'
     self.glove_zip_path = FLAGS.glove_dir + 'glove.zip'
     if not os.path.isdir(FLAGS.glove_dir):
         os.makedirs(FLAGS.glove_dir)
     if not os.path.isfile(self.word_embed_file_path):
         helper._print_header('Downloading GloVe embedding: {0}'.format(
             self.embedding_file))
         url = 'http://nlp.stanford.edu/data/wordvecs/' + self.embedding_file + '.zip'
         helper.download(url, self.glove_zip_path)
         with zipfile.ZipFile(self.glove_zip_path, 'r') as zip:
             helper._print_header(
                 f'Extracting glove weights from {self.glove_zip_path} ')
             zip.extractall(path=FLAGS.glove_dir)
Example #9
 def glove_download_pretrained_model(self):
     if not os.path.isdir(directories.GLOVE_DIR):
         os.makedirs(directories.GLOVE_DIR)
     if not os.path.isfile(directories.GLOVE_EMBEDDING_FILE_PATH):
         helper._print_header('Downloading GloVe embedding: {0}'.format(
             directories.GLOVE_EMBEDDING_FILE_NAME))
         url = constants.GLOVE_URL + directories.GLOVE_EMBEDDING_FILE_NAME + '.zip'
         print(url)
         helper.download(url, directories.GLOVE_EMBEDDING_ZIP_PATH)
         with zipfile.ZipFile(directories.GLOVE_EMBEDDING_ZIP_PATH,
                              'r') as zip:
             helper._print_header(
                 f'Extracting glove weights from {directories.GLOVE_EMBEDDING_ZIP_PATH} '
             )
             zip.extractall(path=directories.GLOVE_DIR)
Example #10
 def build_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned word2vec embeddings')
     path = directories.WORD2VEC_DIR + 'finetuned_word2vec.model'
     pretrained_path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
     sentences = self.get_enron_sentences()
     if not os.path.isdir(directories.WORD2VEC_DIR):
         os.makedirs(directories.WORD2VEC_DIR)
     if os.path.isfile(path):
         helper._print_subheader('Loading previously fine-tuned model...')
         finetuned_model = gensim.models.Word2Vec.load(path)
     else:
         if not self.dimensions == 300:
             helper._print('Only support word2vec with vectors of size 300')
             sys.exit()
         if not os.path.isfile(pretrained_path):
             helper._print(
                 'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
             sys.exit()
         helper._print_subheader('Unpacking ' + pretrained_path)
         model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
         helper._print_subheader('Done unpacking!')
         finetuned_model = gensim.models.Word2Vec(
             size=FLAGS.word_embedding_size,
             sg=1,  # Use Skip-Gram (0 for CBOW)
             hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
             window=FLAGS.word2vec_window,
             min_count=FLAGS.word2vec_min_count,
             workers=10,
             iter=1
         )
         helper._print_subheader('Building fine-tuned model vocab...')
         finetuned_model.build_vocab(sentences)
         helper._print_subheader('Updating with pretrained model vocab...')
         finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
         helper._print_subheader('Intersection with pretrained vectors...')
         finetuned_model.intersect_word2vec_format(pretrained_path, binary=True, lockf=1.0)
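         # lockf=1.0 keeps the imported pretrained vectors unlocked so they are still
         # updated during the training pass on the Enron sentences below.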
         model_logger = Word2VecLogger()
         finetuned_model.train(sentences, total_examples=len(sentences), epochs=FLAGS.word2vec_epochs,
                               callbacks=[model_logger])
         helper._print_subheader('Saving model...')
         finetuned_model.save(path)
     vocab = self.build_vocab(sentences)
     return self.word2vec_index_keyed_vector(keyed_vector=finetuned_model.wv, vocab=vocab)
Example #11
    def build_pretrained_embeddings(self):
        helper._print_header('Getting pretrained word2vec embeddings')
        path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
        sentences = self.get_enron_sentences()
        if not os.path.isdir(directories.WORD2VEC_DIR):
            os.makedirs(directories.WORD2VEC_DIR)
        if not self.dimensions == 300:
            helper._print('Only support word2vec with vectors of size 300')

        if not os.path.isfile(path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        else:
            helper._print_subheader('Unpacking ' + path)
            model = KeyedVectors.load_word2vec_format(path, binary=True)
            helper._print_subheader('Done unpacking!')
            vocab = self.build_vocab(sentences)
            return self.word2vec_index_keyed_vector(keyed_vector=model, vocab=vocab)
Example #12
    def load_enron_txt_data(self):
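        # Download the Kaggle Enron email CSV if it is missing, then write the email
        # bodies out as one sentence per line in a plain .txt file.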
        helper._print_header("Loading Enron emails")
        try:
            if os.name == 'nt':
                """
                Using sys.maxsize throws an Overflow error on Windows 64-bit platforms since internal
                representation of 'int'/'long' on Win64 is only 32-bit wide. Ideally limit on Win64
                should not exceed ((2**31)-1) as long as internal representation uses 'int' and/or 'long'
                """
                csv.field_size_limit((2**31) - 1)
            else:
                csv.field_size_limit(sys.maxsize)
        except OverflowError as e:
            # skip setting the limit for now
            pass
        if not os.path.isfile(directories.ENRON_EMAILS_CSV_PATH):
            data = 'wcukierski/enron-email-dataset'
            helper._print_subheader(f'Downloading enron emails from Kaggle')
            helper.download_from_kaggle(data, directories.ENRON_DIR)
            helper._print_subheader('Download finished! Unzipping...')
            with zipfile.ZipFile(directories.ENRON_EMAILS_ZIP_PATH,
                                 'r') as zip:
                zip.extractall(path=directories.ENRON_DIR)
        if not os.path.isfile(directories.ENRON_EMAILS_TXT_PATH):
            helper._print_subheader('Processing emails into .txt file!')
            with open(directories.ENRON_EMAILS_CSV_PATH, 'r',
                      encoding='utf-8') as emails_csv:
                with open(directories.ENRON_EMAILS_TXT_PATH,
                          'w',
                          encoding='utf-8') as text_file:
                    email_reader = csv.reader(emails_csv, delimiter=",")
                    for index, row in enumerate(email_reader):
                        if index == 0:
                            continue
                        sentences = nltk.sent_tokenize(
                            self.format_email_body(row))
                        for sent in sentences:
                            if len(sent.split(' ')) > 2:
                                text_file.write(sent + '\n')
                        if index % 100000 == 0 and index != 0:
                            helper._print(f'{index} emails processed')

        helper._print_subheader('Enron email data loaded!')
Example #13
 def word2vec_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned word2vec embeddings')
     if not os.path.isdir(FLAGS.word2vec_dir):
         os.makedirs(FLAGS.word2vec_dir)
     if os.path.isfile(FLAGS.word2vec_dir + 'finetuned_word2vec.model'):
         helper._print_subheader('Loading previously fine-tuned model...')
         finetuned_model = Word2Vec.load(FLAGS.word2vec_dir +
                                         'finetuned_word2vec.model')
     else:
         if not self.dimensions == 300:
             helper._print('Only support word2vec with vectors of size 300')
             sys.exit()
         binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
         if not os.path.isfile(binary_file_path):
             helper._print(
                 'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
             )
             sys.exit()
         helper._print_subheader('Unpacking ' + binary_file_path)
         model = KeyedVectors.load_word2vec_format(binary_file_path,
                                                   binary=True)
         helper._print_subheader('Done unpacking!')
         sentences = self.get_enron_sentences()
         finetuned_model = Word2Vec(size=300, min_count=3)
         helper._print_subheader('Building fine-tuned model vocab...')
         finetuned_model.build_vocab(sentences)
         helper._print_subheader('Updating with pretrained model vocab...')
         finetuned_model.build_vocab([list(model.vocab.keys())],
                                     update=True)
         helper._print_subheader('Intersection with pretrained vectors...')
         finetuned_model.intersect_word2vec_format(binary_file_path,
                                                   binary=True,
                                                   lockf=1.0)
         model_logger = Word2VecLogger()
         finetuned_model.train(sentences,
                               total_examples=len(sentences),
                               epochs=FLAGS.word2vec_finetuned_mode_epochs,
                               callbacks=[model_logger])
         helper._print_subheader('Saving model...')
         finetuned_model.save(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
     return self.word2vec_index_keyed_vector(finetuned_model.wv)
Example #14
    def word2vec_pretrained_embeddings(self):
        helper._print_header('Getting pretrained word2vec embeddings')
        if not os.path.isdir(FLAGS.word2vec_dir):
            os.makedirs(FLAGS.word2vec_dir)
        self.word_embed_file_path = FLAGS.word2vec_dir + self.embedding_file + '.txt'
        if not self.dimensions == 300:
            helper._print('Only support word2vec with vectors of size 300')

        if not os.path.isfile(self.word_embed_file_path):
            binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
            if not os.path.isfile(binary_file_path):
                helper._print(
                    'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
                )
                sys.exit()
            else:
                helper._print_subheader('Unpacking ' + binary_file_path)
                model = KeyedVectors.load_word2vec_format(binary_file_path,
                                                          binary=True)
                helper._print_subheader('Done unpacking!')
                return self.word2vec_index_keyed_vector(model)
Example #15
    def __init__(self,
                 model,
                 sess,
                 saver,
                 summary,
                 load,
                 gpu,
                 batch_size=FLAGS.batch_size,
                 epochs=FLAGS.epochs):
        helper._print_header("Training " + model.model_name)
        helper._print("Load model:", load)
        helper._print("Model:", model.__class__.__name__)
        helper._print("Use GPU:", gpu)
        helper._print("Test ratio:",
                      tree_util.ratio_of_labels(model.data.test_trees))
        helper._print("Validation ratio:",
                      tree_util.ratio_of_labels(model.data.val_trees))
        helper._print("Train ratio:",
                      tree_util.ratio_of_labels(model.data.train_trees))
        helper._print("Batch size:", batch_size)
        helper._print("Max epochs:", epochs)
        helper._print("Convergence epochs:", FLAGS.conv_cond)
        helper._print("Max pre-training epochs:", FLAGS.pretrain_max_epoch)
        helper._print("L2-Regularization Scalar", FLAGS.l2_scalar)
        helper._print("Dropout probability:", FLAGS.dropout_prob)
        helper._print("Representation Size:", FLAGS.sentence_embedding_size)
        helper._print(
            "Word embedding model:",
            FLAGS.word_embed_model + ' (' + FLAGS.word_embed_mode + ')')
        helper._print("Use root loss:", FLAGS.use_root_loss)
        helper._print("Use selective training:", FLAGS.use_selective_training)
        helper._print("Decay for every step:", FLAGS.lr_decay)
        helper._print("Learning rate start:", FLAGS.learning_rate)

        self.model = model
        self.batch_size = batch_size
        self.sess = sess
        self.saver = saver
        self.summary = summary
Example #16
def train_classifier():
    data = get_data()

    classifier = Sequential()
    classifier.add(
        Dense(100,
              activation=tf.nn.relu,
              input_shape=(FLAGS.sentence_embedding_size, )))
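    # Note: range(1 - 1) is empty, so no additional hidden layers are added as written.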
    for i in range(1 - 1):
        classifier.add(
            Dense(100,
                  activation='relu',
                  kernel_regularizer=tf.keras.regularizers.l2(0.3)))
        classifier.add(Dropout(0.5))
    classifier.add(Dense(2, activation='softmax'))
    classifier.compile(optimizer=Adagrad(0.01),
                       loss='categorical_crossentropy',
                       metrics=['accuracy',
                                Recall(),
                                Precision(), f1])

    classifier.summary()

    helper._print_header('Training classifier')

    classifier.fit(data['train'][0],
                   data['train'][1],
                   batch_size=FLAGS.classifier_batch_size,
                   validation_data=(data['val'][0], data['val'][1]),
                   epochs=200,
                   callbacks=[
                       EarlyStopping(monitor='val_accuracy',
                                     patience=25,
                                     min_delta=0.01),
                       SaveBestModelCallback()
                   ],
                   verbose=2)
Example #17
    def build_model(self):

        helper._print_header("Constructing tRNN structure")

        # phrase node tensors
        rep_array = tf.TensorArray(tf.float32,
                                   size=0,
                                   dynamic_size=True,
                                   clear_after_read=False,
                                   infer_shape=False)

        o_array = tf.TensorArray(tf.float32,
                                 size=0,
                                 dynamic_size=True,
                                 clear_after_read=False,
                                 infer_shape=False)

        helper._print_header("Building tRNN tree structure")

        # build the tRNN structure
        def embed_word(word_index):
            return tf.nn.embedding_lookup(self.embeddings, word_index)

        def build_node(left_child, right_child, rep_array):
            left_is_leaf = tf.gather(self.is_leaf_array, left_child)
            right_is_leaf = tf.gather(self.is_leaf_array, right_child)

            # reshape from vector to matrix with height 300 and width 1
            rep_l = tf.reshape(rep_array.read(left_child), [300, 1])
            rep_r = tf.reshape(rep_array.read(right_child), [300, 1])

            left = tf.cond(left_is_leaf,
                           lambda: tf.matmul(self.W_l, rep_l) + self.b_W,
                           lambda: tf.matmul(self.U_l, rep_l) + self.b_U)

            right = tf.cond(right_is_leaf,
                            lambda: tf.matmul(self.W_r, rep_r) + self.b_W,
                            lambda: tf.matmul(self.U_r, rep_r) + self.b_U)

            # relu( (sent_size , 1) + (sent_size , 1) + (sent_size , 1) )  = (sent_size , 1)
            return tf.nn.leaky_relu(left + right)

        def tree_construction_body(rep_array, o_array, i):
            # gather variables
            is_leaf = tf.gather(self.is_leaf_array, i)
            word_index = tf.gather(self.word_index_array, i)
            left_child = tf.gather(self.left_child_array, i)
            right_child = tf.gather(self.right_child_array, i)

            # embed_word = (word_size, 1)
            # build_node = (sent_size , 1)
            rep = tf.cond(
                is_leaf, lambda: embed_word(word_index),
                lambda: build_node(left_child, right_child, rep_array))
            if FLAGS.dropout_prob > 0:
                rep = tf.nn.dropout(rep, rate=self.dropout_rate)
            rep_array = rep_array.write(i, rep)

            # o_none = (label_size, 1)
            # softmax( (label_size, sent_size) * (sent_size, 1) + (label_size, 1)) = (label_size, 1)
            o = tf.cond(
                is_leaf,
                lambda: self.o_none,
                lambda: tf.matmul(self.V, rep) + self.b_p  # TODO maybe without activation function
            )
            o_array = o_array.write(i, o)

            i = tf.add(i, 1)
            return rep_array, o_array, i

        termination_cond = lambda rep_a, o_a, i: tf.less(
            i, tf.squeeze(tf.shape(self.is_leaf_array)))
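        # Loop until every node in the flattened tree arrays has been processed.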

        tf.print('hello', (self.is_leaf_array), output_stream=sys.stderr)

        self.rep_array, self.o_array, _ = tf.while_loop(
            cond=termination_cond,
            body=tree_construction_body,
            loop_vars=(rep_array, o_array, 0),
            parallel_iterations=1)
Example #18
 def build_trained_embeddings(self):
     helper._print_header('Getting trained GloVe embeddings')
     vocab, _, _ = self.train_and_save_embeddings()
     return self.generate_indexes(
         vocab, directories.TRAINED_GLOVE_EMBEDDING_FILE_PATH)
Example #19
 def glove_pretrained_embeddings(self):
     helper._print_header('Getting pretrained GloVe embeddings')
     self.glove_download_pretrained_model()
     return self.glove_generate_indexes()
Example #20
def selective_train(model,
                    load=False,
                    gpu=True,
                    batch_size=FLAGS.batch_size,
                    epochs=FLAGS.epochs,
                    run_times=[],
                    epoch_times=[],
                    conv_cond=FLAGS.conv_cond,
                    num_threads=FLAGS.num_threads):
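    # With gpu=False, device_count={'GPU': 0} hides all GPUs so the session runs on the CPU.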
    if gpu:
        config = None
    else:
        config = tf.ConfigProto(device_count={'GPU': 0})

    with tf.Session(config=config) as sess:

        # initialization
        saver = tf.train.Saver()
        selector = Selector(model, sess, FLAGS.num_clusters,
                            FLAGS.cluster_model)
        summary = summarizer(model.model_name, sess)
        summary.construct_dir()
        trainer = Trainer(model,
                          sess,
                          saver,
                          summary,
                          load=load,
                          gpu=gpu,
                          batch_size=batch_size)

        if load:
            model.load_tmp(sess, saver)
            summary.load()
        else:
            model.initialize(sess)
            summary.initialize()

        summary.save_parameters(
            lr=model.learning_rate,
            lr_end=model.learning_rate_end,
            gpu=gpu,
            lr_decay=model.lr_decay,
            conv_cond=conv_cond,
            model=model.__class__.__name__,
            number_variables=model.get_no_trainable_variables(),
            max_epochs=epochs,
            optimizer=model.optimizer)

        # Pre-training
        train_data, val_data, test_data = model.data.train_trees, model.data.val_trees, model.data.test_trees
        while not summary.dropping() and not summary.interrupt():
            summary.epoch_inc()
            helper._print_subheader(
                f'Epoch {summary.get_epoch()} (Pre-training)')

            trainer.train(train_data)

            summary.compute(summary.VAL,
                            data=model.data.val_trees,
                            model=model,
                            _print=True)

            summary.save_history()
            summary.time_tick()

            if summary.new_best_acc(summary.VAL):
                helper._print("New best val model found!")
                model.save_best(sess, saver, summary.VAL)

            if summary.new_best_acc(summary.TRAIN):
                helper._print("New best train model found!")
                model.save_best(sess, saver, summary.TRAIN)
            else:
                helper._print(
                    "No new best model found!!! Prev best training acc:",
                    summary.best_acc[summary.TRAIN])
            # summary.dropping_tick()
            summary.save_speed()
            summary.pre_tick()

        # todo maybe allow multiple repeat selective training
        # Selecting
        helper._print_header('PRETRAINING ENDED!')
        model.load_best(sess, saver, summary.TRAIN)
        model.save_pre_end(sess, saver, summary.TRAIN)

        # Main training
        first = True
        while not summary.converging() and not summary.interrupt():
            summary.main_count_tick()

            if summary.re_cluster():
                # if main_count == 1 or (FLAGS.use_multi_cluster and main_count % int(FLAGS.pretrain_max_epoch/4)==0):
                helper._print_header(f'Clustering for MAIN TRAINING!')
                train_data_selection, cluster_predictions = selector.select_data(
                    model.data.train_trees, FLAGS.selection_cut_off)
                summary.save_cluster_predictions(cluster_predictions)
                summary.time_tick("Selection time:")

            elif first and FLAGS.load_model:
                cluster_predictions = summary.load_cluster_predictions()
                train_data_selection, cluster_predictions = selector.select_data(
                    model.data.train_trees,
                    FLAGS.selection_cut_off,
                    cluster_predictions=cluster_predictions)
                first = False

            summary.epoch_inc()

            helper._print_subheader(
                f'Epoch {summary.get_epoch()} (Main training)')
            helper._print(
                f'Using {len(train_data_selection)}/{len(train_data)} ({len(train_data_selection)/len(train_data)*100}%) for training data.'
            )

            trainer.train(train_data_selection)

            summary.compute(summary.VAL,
                            data=model.data.val_trees,
                            model=model,
                            _print=True)
            summary.save_history()
            summary.time_tick()

            if summary.new_best_acc(summary.VAL):
                helper._print("New best model found!")
                model.save_best(sess, saver, summary.VAL)
            else:
                helper._print(
                    "No new best model found!!! Prev best validation acc:",
                    summary.best_acc[summary.VAL])
            summary.converging_tick()
            summary.save_speed()

        model.load_best(sess, saver, summary.VAL)
        summary.save_performance(model)
        summary.print_performance()
Example #21
    def __init__(self, data):
        """
        :param data: utils.data
        """
        helper._print_header("Constructing treeRNN constants, placeholders and variables")

        # Setup data
        self.data = data  # TODO: Make data
        self.embeddings = tf.constant(data.word_embed_util.embeddings)

        # constants
        # leaf constant output
        o_none = tf.constant(-1.0, shape=[FLAGS.label_size, 1])
        # loss weight constant w>1 more weight on sensitive loss
        self.weight = tf.constant(FLAGS.sensitive_weight)

        # tree structure placeholders
        self.root_array = tf.placeholder(tf.int32, (None), name='root_array')
        self.is_leaf_array = tf.placeholder(tf.bool, (None), name='is_leaf_array')
        self.word_index_array = tf.placeholder(tf.int32, (None), name='word_index_array')
        self.left_child_array = tf.placeholder(tf.int32, (None), name='left_child_array')
        self.right_child_array = tf.placeholder(tf.int32, (None), name='right_child_array')
        self.label_array = tf.placeholder(tf.int32, (None, FLAGS.label_size), name='label_array')

        # initializers
        xavier_initializer = tf.contrib.layers.xavier_initializer()

        weight_initializer = xavier_initializer
        if FLAGS.weight_initializer == "identity":
            def custom_initializer(shape_list, dtype, partition_info):
                return tf.initializers.identity(gain=0.5)(shape_list, dtype,
                                                          partition_info) + tf.initializers.random_uniform(minval=-0.05,
                                                                                                           maxval=0.05)(
                    shape_list, dtype, partition_info)

            weight_initializer = custom_initializer

        bias_initializer = xavier_initializer
        if FLAGS.bias_initializer == "zero":
            bias_initializer = tf.initializers.zeros()

        # encoding variables
        W_l = tf.get_variable(name='W_l', shape=[FLAGS.sentence_embedding_size, FLAGS.word_embedding_size],
                              initializer=weight_initializer)
        W_r = tf.get_variable(name='W_r', shape=[FLAGS.sentence_embedding_size, FLAGS.word_embedding_size],
                              initializer=weight_initializer)
        self.W_l = W_l
        self.W_r = W_r

        # phrase weights
        U_l = tf.get_variable(name='U_l', shape=[FLAGS.sentence_embedding_size, FLAGS.sentence_embedding_size],
                              initializer=weight_initializer)
        U_r = tf.get_variable(name='U_r', shape=[FLAGS.sentence_embedding_size, FLAGS.sentence_embedding_size],
                              initializer=weight_initializer)
        self.U_l = U_l
        self.U_r = U_r

        # bias
        b_W = tf.get_variable(name='b_W', shape=[FLAGS.sentence_embedding_size, 1], initializer=bias_initializer)
        b_U = tf.get_variable(name='b_U', shape=[FLAGS.sentence_embedding_size, 1], initializer=bias_initializer)

        # classifier weights
        V = tf.get_variable(name='V', shape=[FLAGS.label_size, FLAGS.sentence_embedding_size],
                            initializer=xavier_initializer)
        b_p = tf.get_variable(name='b_p', shape=[FLAGS.label_size, 1], initializer=bias_initializer)
        self.V = V
        self.b_p = b_p

        helper._print_header("Constructing tRNN structure")

        # phrase node tensors
        rep_array = tf.TensorArray(
            tf.float32,
            size=0,
            dynamic_size=True,
            clear_after_read=False,
            infer_shape=False)

        o_array = tf.TensorArray(
            tf.float32,
            size=0,
            dynamic_size=True,
            clear_after_read=False,
            infer_shape=False)

        helper._print_header("Building tRNN tree structure")

        # build the tRNN structure
        def embed_word(word_index):
            return tf.nn.embedding_lookup(self.embeddings, word_index)

        def build_node(left_child, right_child, rep_array):
            left_is_leaf = tf.gather(self.is_leaf_array, left_child)
            right_is_leaf = tf.gather(self.is_leaf_array, right_child)

            # reshape from vector to matrix with height 300 and width 1
            rep_l = tf.reshape(rep_array.read(left_child), [300, 1])
            rep_r = tf.reshape(rep_array.read(right_child), [300, 1])

            left = tf.cond(
                left_is_leaf,
                lambda: tf.matmul(W_l, rep_l) + b_W,
                lambda: tf.matmul(U_l, rep_l) + b_U
            )

            right = tf.cond(
                right_is_leaf,
                lambda: tf.matmul(W_r, rep_r) + b_W,
                lambda: tf.matmul(U_r, rep_r) + b_U
            )

            # relu( (sent_size , 1) + (sent_size , 1) + (sent_size , 1) )  = (sent_size , 1)
            return tf.nn.leaky_relu(left + right)

        def tree_construction_body(rep_array, o_array, i):
            # gather variables
            is_leaf = tf.gather(self.is_leaf_array, i)
            word_index = tf.gather(self.word_index_array, i)
            left_child = tf.gather(self.left_child_array, i)
            right_child = tf.gather(self.right_child_array, i)

            # embed_word = (word_size, 1)
            # build_node = (sent_size , 1)
            rep = tf.cond(
                is_leaf,
                lambda: embed_word(word_index),
                lambda: build_node(left_child, right_child, rep_array)
            )
            rep_array = rep_array.write(i, rep)

            # o_none = (label_size, 1)
            # softmax( (label_size, sent_size) * (sent_size, 1) + (label_size, 1)) = (label_size, 1)
            o = tf.cond(
                is_leaf,
                lambda: o_none,
                lambda: tf.matmul(V, rep) + b_p  # TODO maybe without activation function
            )
            o_array = o_array.write(i, o)

            i = tf.add(i, 1)
            return rep_array, o_array, i

        termination_cond = lambda rep_a, o_a, i: tf.less(i, tf.squeeze(tf.shape(self.is_leaf_array)))

        tf.print('hello', (self.is_leaf_array), output_stream=sys.stderr)

        self.rep_array, self.o_array, _ = tf.while_loop(
            cond=termination_cond,
            body=tree_construction_body,
            loop_vars=(rep_array, o_array, 0),
            parallel_iterations=1
        )

        self.loss = self.get_loss()
        self.acc = self.get_acc_batch()
        self.global_step = tf.train.create_global_step()

        if FLAGS.lr_decay:
            n = int(len(self.data.train_trees) / FLAGS.batch_size)
            total_steps = FLAGS.epochs * n
            decay_steps = n
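            # decay_rate is chosen so that, decaying once per epoch, the learning rate
            # reaches FLAGS.learning_rate_end after FLAGS.epochs epochs.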
            decay_rate = (FLAGS.learning_rate_end / FLAGS.learning_rate) ** (decay_steps / total_steps)
            self.learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, self.global_step, decay_steps,
                                                            decay_rate,
                                                            name='learning_rate')

            helper._print_header("Using learning rate with exponential decay")
            helper._print("Decay for every step:", decay_rate)
            helper._print("Learning rate start:", FLAGS.learning_rate)
            helper._print("Learning rate end:", FLAGS.learning_rate_end)
            helper._print("After number of epochs", FLAGS.epochs)
        else:
            self.learning_rate = tf.constant(FLAGS.learning_rate)

        if FLAGS.optimizer == constants.ADAM_OPTIMIZER:
            self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)
        else:  # FLAGS.optimizer == constants.ADAGRAD_OPTIMIZER:
            self.train_op = tf.train.AdagradOptimizer(self.learning_rate).minimize(self.loss,
                                                                                   global_step=self.global_step)
        self.init = tf.global_variables_initializer()

        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('accuracy', self.acc)
        self.merged_summary_op = tf.summary.merge_all()
Example #22
def cross_validation():
    data = get_data()
    x_train, y_train = data['train']
    x_val, y_val = data['val']
    x_test, y_test = data['test']

    helper._print_header('Searching the parameter space')
    params = {
        'lr': [0.1, 0.01],
        'optimizer': [Adagrad],
        'activation': [relu],
        'dropout': [0, 0.2, 0.5],
        'regularization': [0, 0.01, 0.001],
        # 'weights1': [1, 2],
        # 'weights2': [1],
        'loss_functions': ['categorical_crossentropy'],
        'layers': [1, 3],
        'layer_size': [100, 300],
        'batch_size': [4],
    }
    paramsTest = {
        'lr': [0.1],
        'optimizer': [Adagrad],
        'activation': [relu],
        'dropout': [0.2],
        'regularization': [0.01],
        # 'weights1': [2],
        # 'weights2': [1],
        'loss_functions': ['categorical_crossentropy'],
        'layers': [3],
        'layer_size': [100],
        'batch_size': [64, 4],
    }
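    # paramsTest appears to be a reduced grid for quick runs; only the full `params`
    # grid is passed to the ta.Scan calls below.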
    ta.Scan(
        model=mlp_model,
        x=x_train,
        y=y_train,
        x_val=x_val,
        y_val=y_val,
        params=params,
        dataset_name=FLAGS.model_name,
        experiment_no='Adagrad_V1',
        clear_tf_session=False,
        print_params=False,
    )
    ta.Scan(
        model=mlp_model,
        x=x_train,
        y=y_train,
        x_val=x_val,
        y_val=y_val,
        params=params,
        dataset_name=FLAGS.model_name,
        experiment_no='Adagrad_V2',
        clear_tf_session=False,
        print_params=False,
    )
    ta.Scan(
        model=mlp_model,
        x=x_train,
        y=y_train,
        x_val=x_val,
        y_val=y_val,
        params=params,
        dataset_name=FLAGS.model_name,
        experiment_no='Adagrad_V3',
        clear_tf_session=False,
        print_params=False,
    )
Example #23
    def __init__(self, data):
        """
        :param data: utils.data
        """
        helper._print_header(
            "Constructing treeRNN friendly constants, placeholders and variables"
        )

        # Setup data
        self.data = data  # TODO: Make data
        self.embeddings = tf.constant(data.word_embed_util.embeddings)

        # constants
        # leaf constant output
        self.rep_zero = tf.constant(0.,
                                    shape=[FLAGS.sentence_embedding_size, 1])
        self.word_zero = tf.constant(0., shape=[FLAGS.word_embedding_size, 1])
        self.label_zero = tf.constant(0., shape=[FLAGS.label_size, 1])

        # loss weight constant w>1 more weight on sensitive loss
        self.weight = tf.constant(FLAGS.sensitive_weight)

        # tree structure placeholders
        self.root_array = tf.placeholder(tf.int32, (None), name='root_array')
        self.is_leaf_array = tf.placeholder(tf.bool, (None, None),
                                            name='is_leaf_array')
        self.word_index_array = tf.placeholder(tf.int32, (None, None),
                                               name='word_index_array')
        self.left_child_array = tf.placeholder(tf.int32, (None, None),
                                               name='left_child_array')
        self.right_child_array = tf.placeholder(tf.int32, (None, None),
                                                name='right_child_array')
        self.label_array = tf.placeholder(tf.int32,
                                          (None, None, FLAGS.label_size),
                                          name='label_array')

        # initializers
        xavier_initializer = tf.contrib.layers.xavier_initializer()

        weight_initializer = xavier_initializer
        if FLAGS.weight_initializer == "identity":

            def custom_initializer(shape_list, dtype, partition_info):
                return tf.initializers.identity(gain=0.5)(
                    shape_list, dtype,
                    partition_info) + tf.initializers.random_uniform(
                        minval=-0.05, maxval=0.05)(shape_list, dtype,
                                                   partition_info)

            weight_initializer = custom_initializer

        bias_initializer = xavier_initializer
        if FLAGS.bias_initializer == "zero":
            bias_initializer = tf.initializers.zeros()

        # encoding variables
        W = tf.get_variable(
            name='W',
            shape=[FLAGS.sentence_embedding_size, FLAGS.word_embedding_size],
            initializer=weight_initializer)
        self.W = W

        # phrase weights
        U_l = tf.get_variable(name='U_l',
                              shape=[
                                  FLAGS.sentence_embedding_size,
                                  FLAGS.sentence_embedding_size
                              ],
                              initializer=weight_initializer)
        U_r = tf.get_variable(name='U_r',
                              shape=[
                                  FLAGS.sentence_embedding_size,
                                  FLAGS.sentence_embedding_size
                              ],
                              initializer=weight_initializer)
        self.U_l = U_l
        self.U_r = U_r
        self.weights = tf.concat([W, U_l, U_r], axis=1)

        # bias
        self.b = tf.get_variable(name='b',
                                 shape=[FLAGS.sentence_embedding_size, 1],
                                 initializer=bias_initializer)

        # classifier weights
        V = tf.get_variable(
            name='V',
            shape=[FLAGS.label_size, FLAGS.sentence_embedding_size],
            initializer=xavier_initializer)
        b_p = tf.get_variable(name='b_p',
                              shape=[FLAGS.label_size, 1],
                              initializer=bias_initializer)
        self.V = V
        self.b_p = b_p

        helper._print_header("Constructing tRNN structure")

        # phrase node tensors
        rep_array = tf.TensorArray(tf.float32,
                                   size=0,
                                   dynamic_size=True,
                                   clear_after_read=False,
                                   infer_shape=False)
        rep_array = rep_array.write(0, self.rep_zero)

        o_array = tf.TensorArray(tf.float32,
                                 size=0,
                                 dynamic_size=True,
                                 clear_after_read=False,
                                 infer_shape=False)
        o_array = o_array.write(0, self.label_zero)

        word_array = tf.TensorArray(tf.float32,
                                    size=0,
                                    dynamic_size=True,
                                    clear_after_read=False,
                                    infer_shape=False)
        word_array = word_array.write(0, self.word_zero)

        helper._print_header("Building tRNN tree structure")

        batch_indices = [[i, i] for i in range(FLAGS.batch_size)]
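        # batch_indices selects the diagonal so that gather_rep reads, for each tree in
        # the batch, the representation stored for that tree's child node at this step.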

        def gather_rep(step, children_indices, rep_a):
            children = tf.squeeze(tf.gather(children_indices, step, axis=1))
            return tf.gather_nd(rep_a.gather(children), batch_indices)

        # build the tRNN structure
        def embed_word(word_index):
            return tf.nn.embedding_lookup(self.embeddings, word_index)
            # return tf.cond(
            #     is_leaf,
            #     lambda: tf.nn.embedding_lookup(self.embeddings, word_index),
            #     lambda: self.word_zero
            # )

        def build_node(i, rep_array, word_array):

            # reshape from vector to matrix with height 300 and width 1
            print_op = tf.print("i:",
                                i,
                                "right children:",
                                tf.squeeze(
                                    tf.gather(self.left_child_array, i,
                                              axis=1)),
                                output_stream=sys.stdout)
            with tf.control_dependencies([print_op]):
                rep_l = gather_rep(i, self.left_child_array, rep_array)
            rep_r = gather_rep(i, self.right_child_array, rep_array)
            rep_word = word_array.read(i)

            left = tf.matmul(rep_l, self.U_l)
            right = tf.matmul(rep_r, self.U_r)
            word = tf.matmul(rep_word, self.W)

            return tf.nn.leaky_relu(word + left + right + self.b)

        def tree_construction_body(rep_array, word_array, o_array, i):
            # gather variables
            word_index = tf.gather(self.word_index_array, i)

            # embed_word = (word_size, 1)
            word_emb = embed_word(word_index)
            word_array = word_array.write(i, word_emb)

            # build_node = (sent_size , 1)
            rep = build_node(i, rep_array, word_array)
            rep_array = rep_array.write(i, rep)

            o = tf.matmul(V, rep) + b_p
            o_array = o_array.write(i, o)

            i = tf.add(i, 1)
            return rep_array, word_array, o_array, i

        termination_cond = lambda rep_a, word_a, o_a, i: tf.less(
            i, tf.gather(tf.shape(self.is_leaf_array), 1))

        self.rep_array, self.word_array, self.o_array, _ = tf.while_loop(
            cond=termination_cond,
            body=tree_construction_body,
            loop_vars=(rep_array, word_array, o_array, 1),
            parallel_iterations=1)

        self.loss = self.get_loss()
        self.acc = self.get_acc_batch()
        self.global_step = tf.train.create_global_step()

        if FLAGS.lr_decay:
            n = int(len(self.data.train_trees) / FLAGS.batch_size)
            total_steps = FLAGS.epochs * n
            decay_steps = n
            decay_rate = (FLAGS.learning_rate_end /
                          FLAGS.learning_rate)**(decay_steps / total_steps)
            self.learning_rate = tf.train.exponential_decay(
                FLAGS.learning_rate,
                self.global_step,
                decay_steps,
                decay_rate,
                name='learning_rate')

            helper._print_header("Using learning rate with exponential decay")
            helper._print("Decay for every step:", decay_rate)
            helper._print("Learning rate start:", FLAGS.learning_rate)
            helper._print("Learning rate end:", FLAGS.learning_rate_end)
            helper._print("After number of epochs", FLAGS.epochs)
        else:
            self.learning_rate = FLAGS.learning_rate

        if FLAGS.optimizer == constants.ADAM_OPTIMIZER:
            self.train_op = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.loss,
                                             global_step=self.global_step)
        else:  # FLAGS.optimizer == constants.ADAGRAD_OPTIMIZER:
            self.train_op = tf.train.AdagradOptimizer(
                self.learning_rate).minimize(self.loss,
                                             global_step=self.global_step)
        self.init = tf.global_variables_initializer()

        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('accuracy', self.acc)
        self.merged_summary_op = tf.summary.merge_all()
Example #24
 def build_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned GloVe embeddings')
     self.glove_download_pretrained_model()
     vocab, _, _ = self.train_and_save_finetuned_embeddings()
     return self.generate_indexes(
         vocab, directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
Example #25
 def on_epoch_begin(self, model):
     helper._print_header(f'Epoch: {self.epoch}/{model.iter}')
     self.epoch += 1
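
For reference, the Word2VecLogger that the examples above pass to model.train(..., callbacks=[model_logger]) follows gensim's CallbackAny2Vec hook interface. A minimal sketch of such a logger (reusing the project's helper module; the actual class in the project may differ) could look like this:

from gensim.models.callbacks import CallbackAny2Vec

class Word2VecLogger(CallbackAny2Vec):
    """Print a header at the start of each word2vec training epoch."""

    def __init__(self):
        self.epoch = 1

    def on_epoch_begin(self, model):
        # helper is the project's printing utility module (assumed available here)
        helper._print_header(f'Epoch: {self.epoch}/{model.iter}')
        self.epoch += 1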
Example #26
    def train_old(self):
        helper._print_header("Training tRNN")
        helper._print("Test ratio:",
                      tree_util.ratio_of_labels(self.data.test_trees))
        helper._print("Validation ratio:",
                      tree_util.ratio_of_labels(self.data.val_trees))
        helper._print("Train ratio:",
                      tree_util.ratio_of_labels(self.data.train_trees))

        # todo make a flag for this
        config = tf.ConfigProto(device_count={'GPU': 0})

        with tf.Session(config=config) as sess:
            model_placement = FLAGS.models_dir + FLAGS.model_name + "model.ckpt"

            # Summary writer for both the training and the set acc and loss - used for tensorboard
            self.make_needed_dir()
            directory = FLAGS.logs_dir + FLAGS.model_name
            train_writer = tf.summary.FileWriter(directory + 'train',
                                                 sess.graph)
            validation_writer = tf.summary.FileWriter(directory + 'validation')
            test_writer = tf.summary.FileWriter(directory + 'test')

            history = self.get_history()
            starting_steps = 0
            best_acc = 0

            # Run the init
            saver = tf.train.Saver()
            self.run_tensorboard()
            if FLAGS.load_model:
                history, starting_steps, best_acc = self.load_history()
                helper._print("Previously", starting_steps,
                              "steps has been ran, best acc was:", best_acc)

                self.load_model(sess, model_placement, saver)
                self.write_history_to_summary(history, train_writer,
                                              validation_writer, test_writer)
                sess.run(tf.assign(self.global_step, starting_steps))
            else:
                sess.run(self.init)
                self.handle_val_test(history, sess, test_writer, 0,
                                     validation_writer)

            start_time = time.time()
            loss_total = 0
            acc_total = 0
            for epoch in range(FLAGS.epochs):
                helper._print_header("Epoch " + str(epoch + 1))

                batch_size = (FLAGS.batch_size if epoch >= 10 else 1)
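                # Use a batch size of 1 for the first 10 epochs, then switch to FLAGS.batch_size.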

                print_interval = FLAGS.print_step_interval / batch_size
                for step, tree in enumerate(
                        helper.batches(
                            np.random.permutation(self.data.train_trees),
                            batch_size)):  # todo build train get_trees
                    if step % int(print_interval) == 0:
                        total_step = starting_steps + epoch * int(
                            len(self.data.train_trees)) + step * batch_size
                        helper._print("Step:", total_step)
                        helper._print("Learning rate:",
                                      sess.run(self.learning_rate))

                        avg_acc = acc_total / print_interval
                        avg_loss = loss_total / print_interval
                        if epoch != 0 or step != 0:
                            self.write_to_summary(avg_acc, avg_loss,
                                                  total_step, train_writer)
                            helper._print("Train -  acc:", avg_acc, "loss:",
                                          avg_loss)
                            history["train"].append(
                                (total_step, avg_acc, avg_loss))

                            val_acc = self.handle_val_test(
                                history, sess, test_writer, total_step,
                                validation_writer)

                            loss_total = 0
                            acc_total = 0

                            if val_acc > best_acc:
                                best_acc = val_acc
                                helper._print("A better model was found!")

                                saver.save(sess, model_placement)

                                np.savez(FLAGS.histories_dir +
                                         FLAGS.model_name + 'history.npz',
                                         train=history["train"],
                                         test=history["test"],
                                         val=history["val"],
                                         total_steps=total_step,
                                         best_acc=best_acc)

                                helper._print("Model saved!")

                    feed_dict = self.build_feed_dict_batch(
                        tree)  # todo maybe change to batches
                    _, acc, loss = sess.run(
                        [self.train_op, self.acc, self.loss],
                        feed_dict=feed_dict)

                    acc_total += acc
                    loss_total += loss

                helper._print("Avg Epoch Time:",
                              (time.time() - start_time) / (epoch + 1) / 60,
                              "m")