Example #1
def load_embeddings(sess, model, metadata, path):
    # Map each trainable variable's name to the variable itself.
    tv_dict = {v.name: v for v in tf.trainable_variables()}
    if model.mode == ATTENTION_MODE or \
       model.mode == ATTENTION_DIVERSITY_MODE:
        name1 = 'embedding_attention_seq2seq'
        name2 = 'embedding_attention_decoder'
    else:
        name1 = 'embedding_rnn_seq2seq'
        name2 = 'embedding_rnn_decoder'
    emb1 = tv_dict[
        'decoder/%s/rnn/embedding_wrapper/embedding:0' % name1]
    emb2 = tv_dict[
        'decoder/%s/%s/embedding:0' % (name1, name2)]
    # Sanity check: print the embedding sums before loading...
    emb1_val = sess.run(emb1)
    emb2_val = sess.run(emb2)
    print(np.sum(emb1_val), np.sum(emb2_val))
    load_embeddings.load_embedding(
        sess,
        metadata['w2idx'], [emb1, emb2],
        path,
        dim_embedding=300,
        vocab_length=len(metadata['w2idx']))
    # ...and after: the sums should now reflect the pretrained vectors.
    emb1_val = sess.run(emb1)
    emb2_val = sess.run(emb2)
    print(np.sum(emb1_val), np.sum(emb2_val))
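All of these examples delegate the actual loading to a load_embedding helper whose body is never shown. Below is a minimal sketch of what such a helper plausibly looks like, assuming gensim's word2vec text format (Example #4's comment notes that gensim is required) and the placeholder/assign pattern that keeps the pretrained matrix out of the serialized graph. Names and details are assumptions, not the repository's actual code:

import numpy as np
import tensorflow as tf
from gensim import models

def load_embedding(session, vocab, emb, path, dim_embedding, vocab_size):
    # Sketch only: fill the variable `emb` with pretrained vectors for the
    # words in `vocab`; words missing from the file get small random vectors.
    w2v = models.KeyedVectors.load_word2vec_format(path, binary=False)
    external = np.zeros(shape=(vocab_size, dim_embedding), dtype=np.float32)
    matches = 0
    for tok, idx in vocab.items():
        if tok in w2v.vocab:
            external[idx] = w2v[tok]
            matches += 1
        else:
            external[idx] = np.random.uniform(-0.25, 0.25, dim_embedding)
    print("%d of %d vocabulary words found in the pretrained file"
          % (matches, vocab_size))

    # Assign through a placeholder so the matrix is not baked into the
    # graph definition as a constant.
    plh = tf.placeholder(tf.float32, shape=[vocab_size, dim_embedding])
    session.run(emb.assign(plh), {plh: external})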
Example #2
    def _embeddings(self, pretrained=False, scope_name=None):
        """Compute word embeddings for sentence.

        Parameters
        ----------
        pretrained: bool, default False
            Whether to use pretrained embeddings
        scope_name: str, default None
            Variable scope
        """
        if not scope_name:
            scope_name = "Embedding"

        self.sentence_ph = tf.placeholder(dtype=tf.int32,
                                          shape=[None, self.time_steps + 1],
                                          name="Sentence_placeholder")

        with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
            self.embedding_matrix = tf.get_variable(
                name="embedding_matrix",
                shape=[self.len_corpus, self.embedding_size],
                initializer=xav_init()
            )

            if pretrained:
                print("Loading pretrained embeddings...")
                load_embedding(session=self.session,
                               vocab=self.dataset.word_to_idx,
                               emb=self.embedding_matrix,
                               path=self.dataset.embedding_file,
                               vocab_size=self.len_corpus,
                               dim_embedding=self.embedding_size)

            self.word_embeddings = tf.nn.embedding_lookup(self.embedding_matrix,
                                                          self.sentence_ph)
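A hypothetical call site for the method above (the model instance and batch array are assumptions, not part of the snippet): once _embeddings has run, word_embeddings has shape [batch, time_steps + 1, embedding_size] and is driven through sentence_ph:

# Hypothetical usage; `model` and `batch_ids` are not shown in the source.
model._embeddings(pretrained=True)  # builds sentence_ph and word_embeddings

# batch_ids: int32 array of shape [batch, time_steps + 1] of vocabulary indices
vectors = model.session.run(model.word_embeddings,
                            feed_dict={model.sentence_ph: batch_ids})
# vectors.shape == (batch, time_steps + 1, embedding_size)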
Example #3
def main():
    # Write to both logfile and stdout
    timestamp = time.strftime('%Y-%m-%d--%H_%M_%S')
    sys.stdout = Logger(timestamp)

    # Read train data
    train_reader = Reader(vocab_size=cfg["vocab_size"],
                          sentence_length=cfg["sentence_length"],
                          max_sentences=cfg["max_sentences"])
    train_reader.build_dict(cfg["dictionary_name"], cfg["path"]["train"])
    train_reader.read_sentences(cfg["path"]["train"])

    if cfg["use_pretrained"]:
        # Fill a zero-initialized variable with the given pretrained embeddings.
        sess = tf.Session()
        embeddings = tf.Variable(
            dtype=tf.float32,
            initial_value=np.zeros(shape=(cfg["vocab_size"],
                                          cfg["embeddings_size"])))
        load_embeddings.load_embedding(
            session=sess,
            vocab=train_reader.vocab_dict,
            emb=embeddings,
            path=cfg["path"]["embeddings"],
            dim_embedding=cfg["embeddings_size"])
        m = model.Model(cfg=cfg, embeddings=embeddings)
    else:
        m = model.Model(cfg=cfg)

    # Training
    m.build_forward_prop()
    m.build_backprop()

    # Read evaluation data
    eval_reader = Reader(vocab_size=cfg["vocab_size"],
                         sentence_length=cfg["sentence_length"],
                         vocab_dict=train_reader.vocab_dict,
                         max_sentences=cfg["max_test_sentences"])
    eval_reader.read_sentences(cfg["path"]["eval"])

    m.train(train_data=train_reader.id_data, test_data=eval_reader.id_data)

    # Read test data
    test_reader = Reader(vocab_size=cfg["vocab_size"],
                         sentence_length=cfg["sentence_length"],
                         vocab_dict=train_reader.vocab_dict,
                         max_sentences=cfg["max_test_sentences"])
    test_reader.read_sentences(cfg["path"]["test"])

    # Invert the dictionary (index -> word) for the perplexity computation
    reverted_dict = {idx: word for word, idx in test_reader.vocab_dict.items()}

    m.test(data=test_reader.id_data, vocab_dict=reverted_dict)
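Example #3 reads everything from a cfg mapping that the snippet never defines. A hypothetical minimal config covering exactly the keys used above (every value is a placeholder, not taken from the source):

cfg = {
    "vocab_size": 20000,
    "sentence_length": 30,
    "embeddings_size": 100,
    "max_sentences": None,          # no cap on training sentences
    "max_test_sentences": None,
    "use_pretrained": True,
    "dictionary_name": "vocab.pkl",
    "path": {
        "train": "data/sentences.train",
        "eval": "data/sentences.eval",
        "test": "data/sentences.test",
        "embeddings": "wordembeddings-dim100.word2vec",
    },
}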
Example #4
def train_network(operators,
                  sentences_array,
                  num_epochs,
                  vocabulary,
                  configProto=None,
                  num_steps=MAX_SENTENCE_LENGTH,
                  batch_size=BATCH_SIZE,
                  state_size=CELL_SIZE,
                  checkpoint_filename=None,
                  useWord2Vec=False):
    """
        Trains the network using a given graph

        operators           Dictionary of graph operators to execute
        sentences_array     Input sentences (in index form)
        num_epochs          Number of epochs to train for
        vocabulary          Word2Index dictionary of our vocabulary
        configProto         Session configuration for tensorflow
        num_steps           Number of steps of our RNN / Sequence length
        batch_size          Number of sentences per batch
        state_size          Size of the hidden state in an RNN cell
        checkpoint_filename Name of the file to save the graph.
                            Set to None if no saving is required.
        useWord2Vec         Indicates whether to load word embeddings
                            from the provided word2vec file.
    """

    with tf.Session(config=configProto) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=1)
        global_step = 1

        # Init Tensorboard summaries. This will save Tensorboard information into a different folder at each run.
        timestamp = str(math.trunc(time.time()))
        train_writer = tf.summary.FileWriter("{}{}-{}-training".format(
            LOG_DIRECTORY, timestamp, checkpoint_filename),
                                             graph=tf.get_default_graph())
        validation_writer = tf.summary.FileWriter("{}{}-{}-validation".format(
            LOG_DIRECTORY, timestamp, checkpoint_filename),
                                                  graph=tf.get_default_graph())
        '''
            If the flag is activated, load the word2vec embeddings. This is placed just before
            training since it requires the session object. Also, since it needs the W_embed
            tensor, we fetch it from the "word_embedding" variable scope.

            load_embeddings.py requires the gensim package!
        '''
        if useWord2Vec:
            with tf.variable_scope("word_embedding"):
                tf.get_variable_scope().reuse_variables()
                W_embed = tf.get_variable("W_embed")
            load_embeddings.load_embedding(
                session=sess,
                vocab=vocabulary,
                emb=W_embed,
                path="wordembeddings-dim100.word2vec",
                dim_embedding=100,
                vocab_size=VOCABULARY_SIZE)

        # Setup the feed dictionary and fetch the operators to execute
        zero_state = np.zeros([batch_size, state_size])
        feed_dict = {
            operators['init_state_a']: zero_state,
            operators['init_state_b']: zero_state
        }  # For training we always feed zero states at the beginning of each sentence
        summary_op = operators['summary_op']
        train_step = operators['train_step']
        x = operators['x']
        y = operators['y']

        for epoch in range(num_epochs):
            print("Starting epoch {}".format(epoch))
            for X, Y in shuffle_iterator(sentences_array, batch_size,
                                         num_steps):
                global_step += 1

                feed_dict[x] = X
                feed_dict[y] = Y
                '''
                    Every VALIDATION_SUMMARY_FREQUENCY-th batch is used for validation only!
                    In other words,
                        - the weights are not updated on that batch
                        - the current state of the graph is saved regularly (this way, no matter
                          when we kill the script in the GPU VM, the latest graph state is saved)
                '''

                # Every VALIDATION_SUMMARY_FREQUENCY steps we test our network with validation batch
                # that has not been seen by the network.
                if global_step % VALIDATION_SUMMARY_FREQUENCY == 0:
                    summary_validation = sess.run(summary_op, feed_dict)
                    validation_writer.add_summary(summary_validation,
                                                  global_step)
                elif global_step % TRAIN_SUMMARY_FREQUENCY == 0:
                    # Every TRAIN_SUMMARY_FREQUENCY steps we also evaluate the summary on the current training batch
                    _, summary_train = sess.run([train_step, summary_op],
                                                feed_dict)
                    train_writer.add_summary(summary_train, global_step)
                else:
                    # In every other case, just train the network
                    sess.run([train_step], feed_dict)

                # Regularly we save the current model to disk
                if global_step % CHECKPOINT_FREQUENCY == 0:
                    if checkpoint_filename is not None:
                        saver.save(sess,
                                   "./{}-ep{}".format(checkpoint_filename,
                                                      epoch),
                                   global_step=global_step)

        # At the end of the whole training process, save the model to disk
        if checkpoint_filename is not None:
            saver.save(sess,
                       "./{}".format(checkpoint_filename),
                       global_step=global_step)
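For reference, a hypothetical invocation of train_network; build_graph, train_ids, and word_to_idx are assumed to exist elsewhere in the repository and are not shown in the source:

operators = build_graph()                # assumed helper returning the operator dict
train_network(operators,
              sentences_array=train_ids, # sentences already in index form
              num_epochs=10,
              vocabulary=word_to_idx,
              configProto=tf.ConfigProto(allow_soft_placement=True),
              checkpoint_filename="lstm-lm",
              useWord2Vec=True)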
Example #5
def main():
    """load configs & data -> preprocessing"""


    max_predicted_words = 20

    """ PARAMETERS INTO TENSORFLOW FLAGS
        -> the advantage : Variables can be accessed from a tensorflow object without 
        explicitely passing them"""


    tf.flags.DEFINE_string("train_set", train_set, "Path to the training data")
    # Model parameters
    tf.flags.DEFINE_integer("embeddings_size", embeddings_size, "Dimensionality of word embeddings (default: 50)")
    tf.flags.DEFINE_integer("vocabulary_size", vocabulary_size, "Size of the vocabulary (default: 20k)")
    # tf.flags.DEFINE_integer("past_words", 3, "How many previous words are used for prediction (default: 3)")
    # Training parameters
    tf.flags.DEFINE_integer("batch_size", batch_size, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", num_epochs, "Number of training epochs (default: 200)")
    tf.flags.DEFINE_integer("evaluate_every", evaluate_every,
                            "Evaluate model on dev set after this many steps (default: 100)")
    tf.flags.DEFINE_integer("checkpoint_every", checkpoint_every, "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", num_checkpoints, "Number of checkpoints to store (default: 5)")
    tf.flags.DEFINE_integer("lstm_cell_state", lstm_cell_state, "Number of units inside the lastm cell")
    tf.flags.DEFINE_integer("lstm_cell_state_down", lstm_cell_state_down, "Number of units inside the lstm cell")

    # Tensorflow Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

    # for running on EULER, adapt this
    tf.flags.DEFINE_integer("inter_op_parallelism_threads", 4,
                            "TF nodes that perform blocking operations are enqueued on a pool of inter_op_parallelism_threads available in each process (default 0).")
    tf.flags.DEFINE_integer("intra_op_parallelism_threads", 4,
                            "The execution of an individual op (for some op types) can be parallelized on a pool of intra_op_parallelism_threads (default: 0).")

    """Printing model configuration to command line"""

    FLAGS = tf.flags.FLAGS
    # FLAGS._parse_flags()          # add if using tensorflow version <= 1.3

    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value.value))
        # print("{}={}".format(attr.upper(), value))            # change to this if using tensorflow version <= 1.3
    print("")

    def train_step(x_batch, y_batch):
        """
        A single training step; here x_batch equals y_batch.
        Both are matrices of word indices.
        """

        feed_dict = {
            lstm_network.input_x: x_batch,
            lstm_network.input_y: y_batch,
            lstm_network.init_state_hidden: lstm_network.next_hidden_state,
            lstm_network.init_state_current: lstm_network.next_current_state
        }
        _, step, summaries, loss, accuracy, new_hidden_state, new_current_state, vocab_idx_predictions = sess.run(
            [train_optimizer, global_step, train_summary_op, lstm_network.loss,
             lstm_network.accuracy, lstm_network.init_state_hidden, lstm_network.init_state_current,
             lstm_network.vocab_indices_predictions],
            feed_dict)

        # print("Predictions indices w.r.t vocabulary")
        # print(vocab_idx_predictions)
        # print("Example of sentence predicted by the network by training")
        # print(train_utils.words_mapper_from_vocab_indices(vocab_idx_predictions, utils.vocabulary_words_list,
        #                                                 is_tuple=True)[0:28])
        # print("Groundtruth for the sentence predicted by the network above")
        # print(train_utils.words_mapper_from_vocab_indices(np.reshape(x_batch, [batch_size * 29]),
        #                                                  utils.vocabulary_words_list)[0:28])

        lstm_network.next_hidden_state = new_hidden_state
        lstm_network.next_current_state = new_current_state

        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        train_summary_writer.add_summary(summaries, step)

    def predicting_step(word, state):

        """The input in this case is represented by a single word"""

        feed_dict = {
            lstm_network.input_x: word,
            init_state_hidden: state[0],
            init_state_current: state[1]
        }

        word_predicted, next_final_state = sess.run(
            [lstm_network.vocab_indices_predictions, lstm_network.final_lstm_state], feed_dict)

        """Word indices in vocabulary -> charachter words"""
        word_predicted = np.array(word_predicted).reshape((1, 1))

        # print(train_utils.words_mapper_from_vocab_indices(word_predicted, utils.vocabulary))

        """Update state in the lstm, which is the contextual memory"""
        next_hidden_state, next_current_state = next_final_state
        state = (next_hidden_state, next_current_state)
        # print(next_hidden_state)

        return word_predicted, state

    def eval_step(current_step):
        """
        Evaluates the model on eval_set
        """
        eval_dataset, vocabulary_words_list = data_utilities.data_utils(model_to_load, embeddings_size, sentence_len,
                                                                        vocabulary_size, bos,
                                                                        eos, pad, unk).load_eval_data(eval_set,
                                                                                                      vocabulary_pkl)

        batches = train_utils.batch_iter_train(data=eval_dataset, batch_size=test_batch_size, num_epochs=1,
                                         shuffle=False)
        perplexities = []  # array with perplexities for each sentence

        for i, batch in enumerate(batches):
            _, y_batch = zip(*batch)
            y_batch = train_utils.words_mapper_to_vocab_indices(y_batch, vocabulary_words_list)

            feed_dict = {
                lstm_network.input_x: y_batch,
                lstm_network.init_state_hidden: np.zeros([test_batch_size, lstm_cell_state]),
                lstm_network.init_state_current: np.zeros([test_batch_size, lstm_cell_state])
            }

            estimates = sess.run(lstm_network.softmax, feed_dict)
            estimates = np.reshape(estimates, [-1, sentence_len-1, vocabulary_size])

            for j, sentence in enumerate(y_batch):
                sentence_perplexity = eval.perplexity(sentence, estimates[j], vocabulary_words_list)
                print("Sentence {} in batch {}: perplexity {}".format(test_batch_size * i + j, i,
                                                                      sentence_perplexity))
                perplexities.append(sentence_perplexity)

        if eval_perpl_file:
            eval.write_perplexity(perplexities, eval_step=True, current_step=current_step)

        return

    if lstm_is_training:

        """Preprocess data"""
        utils = data_utilities.data_utils(model_to_load, embeddings_size, sentence_len, vocabulary_size, bos,
                                          eos, pad, unk)

        model_w2v, dataset = utils.load_train_data(train_set)
        # dataset=dataset[0:100]

        dataset_size = len(dataset)

        print("Total sentences in the dataset: ", dataset_size)
        print("Example of a random wrapped sentence in dataset ", dataset[(randint(0, dataset_size))])
        print("Example of the first wrapped sentence in dataset ", dataset[0])

        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement,
                inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
                intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize model
                lstm_network = model_lstm.lstm_model(
                    vocab_size=FLAGS.vocabulary_size,
                    embedding_size=FLAGS.embeddings_size,
                    words_in_sentence=sentence_len - 1,
                    lstm_cell_size=lstm_cell_state,
                    lstm_cell_size_down=lstm_cell_state_down,
                    down_project=down_project
                )
            """Please note that the tf variables keeps updated, ready to be printed out or
               logged to file"""

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer()
            train_optimizer = optimizer.minimize(lstm_network.loss, global_step=global_step)

            """ Output directory for models and summaries """
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, runs_dir, timestamp))
            print("Writing to {}\n".format(out_dir))

            """ Summaries for loss and accuracy """
            loss_summary = tf.summary.scalar("loss", lstm_network.loss)
            acc_summary = tf.summary.scalar("accuracy", lstm_network.accuracy)

            """ Train Summaries """
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # """ Dev summaries  """
            # dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            # dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            # dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            """ Checkpoint directory (Tensorflow assumes this directory already exists so we need to create it) """
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            sess.run(tf.global_variables_initializer())

            lstm_network.next_hidden_state = np.zeros([batch_size, lstm_cell_state])
            lstm_network.next_current_state = np.zeros([batch_size, lstm_cell_state])

            if training_with_w2v:
                total_ids = len(utils.vocabulary_words_list)
                vocab_and_ids = dict(zip(utils.vocabulary_words_list, range(total_ids)))

                load_embeddings.load_embedding(session=sess, vocab=vocab_and_ids, emb=lstm_network.W_embedding,
                                               path=embeddings, dim_embedding=embeddings_size,
                                               vocab_size=total_ids)

        """batches is a generator, please refer to training_utilities for more information.
           batch_iter function is executed if an iteration is performed on op of it and it
           gives a new batch each time (sequentially-wise w.r.t the original dataset)"""
        batches = train_utils.batch_iter_train(data=dataset, batch_size=batch_size, num_epochs=num_epochs,
                                               shuffle=shuffle_training,
                                               testing=False)

        for batch in batches:

            x_batch, y_batch = zip(*batch)

            x_batch = train_utils.words_mapper_to_vocab_indices(x_batch, utils.vocabulary_words_list)
            y_batch = train_utils.words_mapper_to_vocab_indices(y_batch, utils.vocabulary_words_list)

            """Train batch is used as evaluation batch as well -> it will be compared with predicitons"""
            train_step(x_batch=x_batch, y_batch=y_batch)
            current_step = tf.train.global_step(sess, global_step)

            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation: computing perplexities for the evaluation set...")
                eval_step(current_step)
                print("")

            if current_step >= max_global_steps:
                sys.exit()

    else:

        """The network is doing predictions"""
        """Restore model for predictions"""

        lstm_network = model_lstm.lstm_model(
            vocab_size=FLAGS.vocabulary_size,
            embedding_size=FLAGS.embeddings_size,
            words_in_sentence=test_sentence_len,
            lstm_cell_size=lstm_cell_state,
            lstm_cell_size_down=lstm_cell_state_down,
            down_project=down_project
        )

        out_dir = os.path.abspath(os.path.join(os.path.curdir, runs_dir))
        all_runs = [os.path.join(out_dir, o) for o in os.listdir(out_dir)
                    if os.path.isdir(os.path.join(out_dir, o))]
        latest_run = max(all_runs, key=os.path.getmtime)  # get the latest run
        checkpoint_dir = os.path.abspath(os.path.join(latest_run, "checkpoints"))

        checkpoint_prefix = tf.train.latest_checkpoint(checkpoint_dir)
        with tf.Session() as sess:

            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(max_to_keep=5)
            saver.restore(sess, checkpoint_prefix)
            # Recover the tensors we need from the restored graph by name.
            input_x = tf.get_default_graph().get_tensor_by_name("input_x:0")
            init_state_current = tf.get_default_graph().get_tensor_by_name("init_state_current:0")
            init_state_hidden = tf.get_default_graph().get_tensor_by_name("init_state_hidden:0")
            vocab_indices_predictions = tf.get_default_graph().get_tensor_by_name("vocab_indices_predictions:0")

            """Load test data"""
            utils = data_utilities.data_utils(model_to_load, embeddings_size, max_predicted_words, vocabulary_size, bos,
                                              eos, pad, unk)

            dataset, _ = utils.load_test_data(path_to_file=cont_set, vocabulary_file_path=vocabulary_pkl)
            # dataset = dataset[0:50]  # uncomment for a quick test run
            dataset_size = len(dataset)
            print("Loaded {} test sentences".format(dataset_size))

            complete_sentences = []

            sentence_nb = 0

            """Zero state feeded initially for each sentence"""
            for sentence in dataset:
                index = sentence.index(eos)
                sentence = sentence[1:index]  # drop <bos> and everything from <eos> on
                print("Sentence is ", sentence)

                nb_initial_words = len(sentence)
                print("Initial sentence length ", nb_initial_words)
                initial_lstm_state = (np.zeros((1, lstm_cell_state)),) * 2

                lstm_state = initial_lstm_state
                full_sentence = []

                for word in sentence:

                    if word == eos:
                        print("FOUND", word)
                        break
                    word = np.array(utils.vocabulary_words_list.index(word)).reshape(1, 1)
                    word_predicted, lstm_state = predicting_step(word, lstm_state)

                    mapped_word = utils.vocabulary_words_list[word_predicted[0][0]]

                    full_sentence.append(utils.vocabulary_words_list[word[0][0]])
                    print(mapped_word)

                    if mapped_word == eos:
                        break
                print("Sentence before continuation is ", full_sentence)
                
                """Futher predictions done through the last predicted word of lstm and the current lstm state"""
                words_remaining = max_predicted_words - nb_initial_words
                print("max_predicted_words ", max_predicted_words)
                print("nb_initial_words ", nb_initial_words)
                states = []
                if full_sentence[-1] != eos:
                    print("Words remaining ", words_remaining)
                    for i in range(words_remaining):

                        last_word_predicted = full_sentence[-1]
                        # print(full_sentence[-1])
                        last_word_predicted = np.array(utils.vocabulary_words_list.index(last_word_predicted)).reshape(
                            1, 1)

                        word_predicted, lstm_new_state = predicting_step(last_word_predicted, lstm_state)

                        lstm_state = lstm_new_state
                        mapped_word = utils.vocabulary_words_list[word_predicted[0][0]]
                        full_sentence.append(mapped_word)

                        if mapped_word == eos:
                            break
                    print("SENTENCE WAS LONG ", len(full_sentence))
                    print("FULL PREDICTION ", full_sentence)
                sentence_nb = sentence_nb + 1
                complete_sentences.append(full_sentence)
                print("Completed sentence number ", sentence_nb)
            """Write predictions to submission file"""
            testing_utils.write_submission_predictions(complete_sentences, bos, eos, n_group)
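eval_step above defers to eval.perplexity, whose implementation is not shown. Per-sentence perplexity over softmax estimates is conventionally the exponential of the mean negative log-probability of the ground-truth words, excluding padding; a sketch under that assumption (the <pad> symbol and argument names are guesses, not the repository's actual code):

import numpy as np

def perplexity(sentence, estimates, vocabulary_words_list, pad="<pad>"):
    # sentence:  ground-truth word indices, length T
    # estimates: softmax probabilities, shape [T, vocab_size]
    # perplexity = exp(-(1/n) * sum_t log p(w_t)), skipping pad positions
    pad_id = vocabulary_words_list.index(pad)
    log_probs = [np.log(estimates[t, w])
                 for t, w in enumerate(sentence) if w != pad_id]
    return float(np.exp(-np.mean(log_probs)))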