    def construct_graph(self, sess):
        with sess.graph.as_default():
      
            # Set the random seed for tensorflow
            tf.set_random_seed(cfg.RNG_SEED)
            # Build the main computation graph
            layers = self.net.create_architecture(True) # is_training flag: True
            # Define the loss
            loss = layers['total_loss']
            path_iter = self.pretrained_model.split('.ckpt')[0]
            iter_num = path_iter.split('_')[-1]
            # from iter_ckpt
            if cfg.TRAIN_MODULE_CONTINUE == 1:
                global_step    = tf.Variable(int(iter_num), trainable=False)
            # from iter 0
            elif cfg.TRAIN_MODULE_CONTINUE == 2:
                global_step    = tf.Variable(0, trainable=False)

            first_decay_steps = 2 * len(self.Trainval_GT) # needs verification; 2 epochs
            lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step, first_decay_steps, t_mul=2.0, m_mul=1.0, alpha=0.0) 
            self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
            list_var_to_update = tf.trainable_variables()
            for var in list_var_to_update:
                print(var.name)
            grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
            capped_gvs     = [(tf.clip_by_norm(grad, 1.), var) for grad, var in grads_and_vars]
            train_op = self.optimizer.apply_gradients(capped_gvs, global_step=global_step)
            
            self.saver = tf.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
            
            # Write the train and validation information to tensorboard
            self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)

        return lr, train_op
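For reference, the schedule used throughout this page is cosine decay with warm restarts (SGDR). A minimal stand-alone sketch of the call, with placeholder values rather than this repo's cfg settings:

import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
lr = tf.train.cosine_decay_restarts(
    learning_rate=0.01,       # peak lr of the first cycle
    global_step=global_step,  # advanced by apply_gradients(..., global_step=global_step)
    first_decay_steps=10000,  # length of the first cycle, in steps
    t_mul=2.0,                # each subsequent cycle is twice as long
    m_mul=1.0,                # each restart begins at the same peak lr
    alpha=0.0)                # floor, as a fraction of learning_rate
optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)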
  def testDecay(self):
    num_training_steps = 1000
    initial_lr = 1.0
    for step in range(0, 1500, 250):
      decayed_lr = learning_rate_decay.cosine_decay_restarts(
          initial_lr, step, num_training_steps)
      expected = self.np_cosine_decay_restarts(step, num_training_steps)
      self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
Example #3
    def testDecay(self):
        num_training_steps = 1000
        initial_lr = 1.0
        for step in range(0, 1500, 250):
            decayed_lr = learning_rate_decay.cosine_decay_restarts(
                initial_lr, step, num_training_steps)
            expected = self.np_cosine_decay_restarts(step, num_training_steps)
            self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
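The tests above compare against a NumPy reference helper, np_cosine_decay_restarts, which is not shown on this page. A sketch of what it presumably computes, following the documented SGDR formula (the helper's exact name and defaults here are assumptions):

import numpy as np

def np_cosine_decay_restarts(step, decay_steps, t_mul=2.0, m_mul=1.0,
                             alpha=0.0, initial_lr=1.0):
    # Locate the restart cycle that `step` falls in and the fraction completed within it.
    fraction = step / decay_steps
    if t_mul == 1.0:
        i_restart = int(fraction)
        fraction -= i_restart
    else:
        i_restart = int(np.floor(np.log(1.0 - fraction * (1.0 - t_mul)) / np.log(t_mul)))
        sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)
        fraction = (fraction - sum_r) / t_mul ** i_restart
    # Peak of the current cycle, then the cosine decay within the cycle.
    m_fac = m_mul ** i_restart
    cosine_decayed = 0.5 * m_fac * (1.0 + np.cos(np.pi * fraction))
    decayed = (1.0 - alpha) * cosine_decayed + alpha
    return initial_lr * decayed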
    def construct_graph(self, sess):
        with sess.graph.as_default():

            # Set the random seed for tensorflow
            tf.set_random_seed(cfg.RNG_SEED)

            # Build the main computation graph
            layers = self.net.create_architecture(True)  # is_training flag: True

            # Define the loss
            loss = layers['total_loss']

            path_iter = self.pretrained_model.split('.ckpt')[0]
            iter_num = path_iter.split('_')[-1]

            # from iter_ckpt
            if cfg.TRAIN_MODULE_CONTINUE == 1:
                global_step = tf.Variable(int(iter_num), trainable=False)

            # from iter 0
            if cfg.TRAIN_MODULE_CONTINUE == 2:
                global_step = tf.Variable(0, trainable=False)

            # lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step, cfg.TRAIN.STEPSIZE * 5, cfg.TRAIN.GAMMA, staircase=True)
            # Here we use a cosine lr schedule with warm restarts instead:
            first_decay_steps = 2 * len(self.Trainval_GT)  # 2 epochs
            lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step, first_decay_steps, t_mul=2.0,
                                       m_mul=1.0, alpha=0.0)
            self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

            # 1: update all parameters, 2: update only D, 3: update H+O+SP, 4: update everything except the S classifiers (fc)
            list_var_to_update = []
            if cfg.TRAIN_MODULE_UPDATE == 1:
                list_var_to_update = tf.trainable_variables()
            if cfg.TRAIN_MODULE_UPDATE == 2:
                list_var_to_update = [var for var in tf.trainable_variables() if
                                      'fc_binary' in var.name or 'binary_classification' in var.name]
            if cfg.TRAIN_MODULE_UPDATE == 3:
                list_var_to_update = [var for var in tf.trainable_variables() if
                                      'fc_binary' not in var.name and 'binary_classification' not in var.name]
            if cfg.TRAIN_MODULE_UPDATE == 4:
                list_var_to_update = [var for var in tf.trainable_variables() if 'classification' not in var.name]

            grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
            capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in grads_and_vars]

            train_op = self.optimizer.apply_gradients(capped_gvs, global_step=global_step)
            self.saver = tf.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
            # Write the train and validation information to tensorboard
            self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)

        return lr, train_op
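One caveat worth noting about the compute_gradients / clip_by_norm pattern above: when list_var_to_update contains variables that do not feed into the loss, compute_gradients returns (None, var) pairs and tf.clip_by_norm(None, 1.) raises. A defensive drop-in sketch for those three lines that leaves such pairs untouched (apply_gradients skips None gradients):

grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
capped_gvs = [(tf.clip_by_norm(g, 1.), v) if g is not None else (g, v)
              for g, v in grads_and_vars]
train_op = self.optimizer.apply_gradients(capped_gvs, global_step=global_step)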
  def testTMul(self):
    num_training_steps = 1000
    initial_lr = 1.0
    t_mul = 1.0
    for step in range(0, 1500, 250):
      with self.test_session():
        decayed_lr = learning_rate_decay.cosine_decay_restarts(
            initial_lr, step, num_training_steps, t_mul=t_mul)
        expected = self.np_cosine_decay_restarts(step, num_training_steps,
                                                 t_mul=t_mul)
        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
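With t_mul=1.0 every restart cycle has the same length, so the sampled steps simply wrap around the first cycle. A quick worked check for step=1250 (a quarter of the way into the second cycle, with m_mul and alpha at their defaults of 1.0 and 0.0):

import numpy as np

step, num_training_steps = 1250, 1000
fraction = (step / num_training_steps) % 1.0   # 0.25
lr = 0.5 * (1.0 + np.cos(np.pi * fraction))    # ~0.8536 for initial_lr = 1.0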
    def construct_graph(self, sess):
        with sess.graph.as_default():

            tf.set_random_seed(cfg.RNG_SEED)

            layers = self.net.create_architecture(True)

            loss = layers['total_loss']

            if cfg.TRAIN_MODULE_CONTINUE == 1:
                path_iter = self.pretrained_model.split('.ckpt')[0]
                iter_num = path_iter.split('_')[-1]
                global_step = tf.Variable(int(iter_num), trainable=False)

            if cfg.TRAIN_MODULE_CONTINUE == 2:
                global_step = tf.Variable(0, trainable=False)

            first_decay_steps = 2 * len(self.Trainval_GT)
            lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10,
                                       global_step,
                                       first_decay_steps,
                                       t_mul=cfg.LR_DECAY.T_MUL,
                                       m_mul=cfg.LR_DECAY.M_MUL,
                                       alpha=0.0)

            self.optimizer = tf.train.GradientDescentOptimizer(lr)

            list_var_to_update = []
            if cfg.TRAIN_MODULE_UPDATE == 1:
                list_var_to_update = tf.trainable_variables()
            elif cfg.TRAIN_MODULE_UPDATE == 2:
                list_var_to_update = [var for var in tf.trainable_variables()
                                      if 'Att_sp' in var.name
                                      or 'bottleneck_sp' in var.name
                                      or 'conv1_sp' in var.name
                                      or 'conv2_sp' in var.name
                                      or 'Concat_SHsp' in var.name
                                      or 'fc7_SHsp' in var.name
                                      or 'body_to_head' in var.name
                                      or 'attention_3D' in var.name
                                      or 'triplet_align' in var.name
                                      or 'space_classification' in var.name
                                      or 'joint_classification' in var.name]

            grads_and_vars = self.optimizer.compute_gradients(
                loss, list_var_to_update)
            capped_gvs = [(tf.clip_by_norm(grad, 1.), var)
                          for grad, var in grads_and_vars]

            train_op = self.optimizer.apply_gradients(capped_gvs,
                                                      global_step=global_step)
            self.saver = tf.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
            self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)

        return lr, train_op
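This variant reads the restart parameters from the config instead of hard-coding them. A hypothetical sketch of the corresponding entries, assuming the usual easydict-style cfg these repos use (the attribute names come from the call above; the values are placeholders):

from easydict import EasyDict as edict

cfg = edict()
cfg.LR_DECAY = edict()
cfg.LR_DECAY.T_MUL = 2.0   # each restart cycle is t_mul times longer than the previous one
cfg.LR_DECAY.M_MUL = 1.0   # each restart starts at m_mul times the previous cycle's peak lr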
Example #7
    def get_optimzer_lr(self, global_step, step_factor):
        stepsize = int(cfg.TRAIN.STEPSIZE * step_factor)
        gamma = cfg.TRAIN.GAMMA
        epoch_iters = get_epoch_iters(self.net.model_name)
        stepsize = epoch_iters * 2  # note: overrides the cfg-based stepsize computed above

        lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10,
                                        global_step,
                                        stepsize,
                                        gamma,
                                        staircase=True)
        optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

        if 'cosine' in self.net.model_name:
            print('cosine =========')
            first_decay_steps = epoch_iters * 10  # 10 epochs
            from tensorflow.python.training.learning_rate_decay import cosine_decay_restarts
            lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10,
                                       global_step,
                                       first_decay_steps,
                                       t_mul=2.0,
                                       m_mul=0.9,
                                       alpha=cfg.TRAIN.LEARNING_RATE * 0.1)
            optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
        elif 'zsrare' in self.net.model_name:  # rare first
            lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10,
                                            global_step,
                                            int(cfg.TRAIN.STEPSIZE * 2),
                                            gamma,
                                            staircase=True)
            optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
        elif 'zsnrare' in self.net.model_name:  # non-rare first
            lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10,
                                            global_step,
                                            int(cfg.TRAIN.STEPSIZE *
                                                step_factor),
                                            gamma,
                                            staircase=True)
            optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
        return lr, optimizer
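Note that alpha in cosine_decay_restarts is the floor expressed as a fraction of the peak learning rate, not an absolute rate, so the 'cosine' branch above decays toward (cfg.TRAIN.LEARNING_RATE * 10) * (cfg.TRAIN.LEARNING_RATE * 0.1) rather than toward cfg.TRAIN.LEARNING_RATE * 0.1; a floor of LEARNING_RATE * 0.1 would correspond to alpha=0.01. A quick sketch with illustrative numbers:

base = 0.001            # stand-in for cfg.TRAIN.LEARNING_RATE
peak = base * 10        # 0.01, the value passed as the initial learning rate
alpha = base * 0.1      # 0.0001, the value passed as alpha
floor = peak * alpha    # 1e-06: the lowest lr this schedule will reach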
Example #8
    def construct_graph(self, sess):
        with sess.graph.as_default():
            # Set the random seed for tensorflow
            tf.compat.v1.set_random_seed(cfg.RNG_SEED)
            # Build the main computation graph
            layers = self.net.create_architecture(
                True)  # is_training flag: True
            # Define the loss
            loss = layers['total_loss']

            # Get the global_step
            if cfg.TRAIN_MODULE_CONTINUE == 1:  # from iter_ckpt
                path_iter = self.pretrained_model.split('.ckpt')[0]
                iter_num = path_iter.split('_')[-1]
                global_step = tf.Variable(int(iter_num), trainable=False)
            elif cfg.TRAIN_MODULE_CONTINUE == 2:  # from iter 0
                global_step = tf.Variable(0, trainable=False)

            # Derive the lr from global_step
            # Exponentially decayed lr (kept for reference):
            # lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step, cfg.TRAIN.STEPSIZE * 5, cfg.TRAIN.GAMMA, staircase=True)
            # Cosine-annealed lr with warm restarts
            first_decay_steps = 80000  # number of steps in the first full decay cycle
            t_mul, m_mul = 2.0, 1.0  # each cycle lasts t_mul times as many steps as the previous one; m_mul scales each restart's starting lr relative to the previous cycle's
            lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10,
                                       global_step,
                                       first_decay_steps,
                                       t_mul,
                                       m_mul,
                                       alpha=0.0)

            # Define the optimizer, using the Momentum algorithm
            self.optimizer = tf.compat.v1.train.MomentumOptimizer(
                lr, cfg.TRAIN.MOMENTUM)

            # 1: update all parameters, 2: update only D, 3: update H+O+SP, 4: update everything except the S classifiers (fc)
            list_var_to_update = []
            if cfg.TRAIN_MODULE_UPDATE == 1:
                list_var_to_update = tf.compat.v1.trainable_variables()
            if cfg.TRAIN_MODULE_UPDATE == 2:
                list_var_to_update = [
                    var for var in tf.compat.v1.trainable_variables()
                    if 'fc_binary' in var.name
                    or 'binary_classification' in var.name
                ]
            if cfg.TRAIN_MODULE_UPDATE == 3:
                list_var_to_update = [
                    var for var in tf.compat.v1.trainable_variables()
                    if 'fc_binary' not in var.name
                    and 'binary_classification' not in var.name
                ]
            if cfg.TRAIN_MODULE_UPDATE == 4:
                list_var_to_update = [
                    var for var in tf.compat.v1.trainable_variables()
                    if 'classification' not in var.name
                ]

            # Compute the gradients of the trainable variables
            grads_and_vars = self.optimizer.compute_gradients(
                loss, list_var_to_update)
            capped_gvs = [(tf.clip_by_norm(grad, 1.), var)
                          for grad, var in grads_and_vars
                          ]  # clip each gradient's L2 norm to 1 to guard against exploding gradients
            # Apply the computed gradients to the variables
            train_op = self.optimizer.apply_gradients(capped_gvs,
                                                      global_step=global_step)

            # Create a Saver for saving the model; it is used in snapshot()
            # max_to_keep limits how many checkpoints are kept: once exceeded, each new checkpoint deletes the oldest one (in our program max_to_keep is None)
            self.saver = tf.compat.v1.train.Saver(
                max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
            # Create a FileWriter to log training summaries to TensorBoard
            self.writer = tf.compat.v1.summary.FileWriter(
                self.tbdir, sess.graph)

        return lr, train_op
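Since this example already goes through the tf.compat.v1 API, it may be worth noting the native TF2 equivalent, which wraps the same schedule in a Keras object instead of a global_step variable. A sketch with illustrative values rather than the repo's cfg:

import tensorflow as tf

schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
    initial_learning_rate=0.01,
    first_decay_steps=80000,
    t_mul=2.0,
    m_mul=1.0,
    alpha=0.0)
optimizer = tf.keras.optimizers.SGD(learning_rate=schedule, momentum=0.9)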
Example #9
def train(args):
    '''start by getting the data_loader object'''
    data_loader = TextLoader(args.reverse, args.data_dir, args.test_split,
                             args.batch_size, args.seq_length,
                             args.input_encoding)
    '''some informative prints'''
    args.vocab_size = data_loader.vocab_size
    print("Train size: ", data_loader.num_batches * args.batch_size)
    if args.test_split > 0:
        print("Test size: ", data_loader.test_num_batches * args.batch_size)
    print("Vocab size: ", args.vocab_size)

    # check compatibility if training is continued from previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(
            args.init_from), " %s must be a path" % args.init_from
        assert os.path.isfile(
            os.path.join(args.init_from, "config.pkl")
        ), "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(
            os.path.join(args.init_from, "words_vocab.pkl")
        ), "words_vocab.pkl.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(
                args
            )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"
    '''save the training config and vocab so the run can be restored later'''
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)
    '''start up the model'''
    model = Model(args)
    '''if a test split is requested, get it'''
    if args.test_split > 0:
        test_x = data_loader.test_x
        test_y = data_loader.test_y
    '''set up summary merging, the TensorBoard writer, and GPU memory options'''
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)
    '''begin the session for training'''
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # take a look at the learning rate schedule, if so desired (note parameters here should match the ones used)
        plot = False
        if plot:
            n = args.num_epochs * data_loader.num_batches
            n = 150000
            x = np.arange(n)
            y = np.zeros((n, 1))
            y = cosine_decay_restarts(
                args.learning_rate,
                x,  # global_step: evaluate the schedule at every step in the range
                50000,  # first_decay_steps; see the plots in https://github.com/tensorflow/tensorflow/pull/11749
                .9,  # t_mul
                .1,  # m_mul
                1e-12).eval()  # alpha
            plt.figure()
            plt.plot(x, y)
            plt.title("Learning rate schedule")
            plt.show()
        '''register the graph with the writer and initialize all variables'''
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        '''initialize from a previous model OR start from scratch, which means grabbing GloVe embeddings'''
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        else:
            print("Loading my knowledge of the English language...")
            embeddings = data_loader.get_embeddings()
            sess.run([model.embedding_init],
                     {model.embedding_placeholder: embeddings})
        '''iterate over the range of epochs specified'''
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            #sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e))) # this is the vanilla exponential decay
            '''learning rate decay via cosine annealing with warm restarts'''
            sess.run(
                tf.assign(
                    model.lr,
                    cosine_decay_restarts(
                        args.learning_rate,
                        e * data_loader.num_batches,  # global_step: shifts down every epoch
                        20000,  # first_decay_steps; see https://github.com/tensorflow/tensorflow/pull/11749
                        1,  # t_mul
                        .1,  # m_mul
                        1e-12)))  # alpha
            '''reset the pointer to start from the beginning'''
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                assign_op = model.epoch_pointer.assign(e)
                sess.run(assign_op)
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None
            '''iterate over the batches in the dataset'''
            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                '''the feed dictionary gets passed to the model when tensorflow variables are computed'''
                feed = {
                    model.input_data: x,
                    model.targets: y,
                    model.initial_state: state,
                    model.batch_time: speed,
                    model.dropout: args.dropout
                }
                '''variables to be trained, either with or without word embeddings'''
                run_list_full = [
                    merged, model.cost, model.final_state, model.train_op,
                    model.inc_batch_pointer_op
                ]
                run_list_no_W = [
                    merged, model.cost, model.final_state, model.train_op_no_W,
                    model.inc_batch_pointer_op
                ]
                # YES, TRAIN THE EMBEDDINGS
                if args.trainable_embeddings == 1:
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_full, feed)
                # NO, DO NOT TRAIN THE EMBEDDINGS (train_op_no_W)
                elif args.trainable_embeddings == 0:
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_no_W, feed)
                # after args.trainable_embeddings epochs have passed, start training the embeddings
                elif e > args.trainable_embeddings:
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_full, feed)
                # not past that epoch threshold yet, so keep the embeddings frozen
                else:
                    summary, train_loss, state, _, _ = sess.run(
                        run_list_no_W, feed)
                '''some diagnostics to be printed, and the model gets saved here too'''
                train_writer.add_summary(summary,
                                         e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                        .format(e * data_loader.num_batches + b,
                                args.num_epochs * data_loader.num_batches,
                                e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e==args.num_epochs-1 and b == data_loader.num_batches-1): # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess,
                               checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
                    print("learning rate: ", model.lr.eval())

                    #TEST LOSS EVAL - evaluates batch by batch with same batch size as for training
                    if (args.test_split > 0):
                        test_loss = 0
                        batches_in_test = len(test_x)
                        save_state = state
                        state = sess.run(model.initial_state)
                        for i in range(batches_in_test):
                            feed = {
                                model.test_x: test_x[i],
                                model.test_y: test_y[i],
                                model.initial_state: state
                            }
                            loss, state, _ = sess.run([
                                model.test_cost, model.test_final_state,
                                model.inc_batch_pointer_op
                            ], feed)
                            test_loss += loss
                        test_loss = test_loss / batches_in_test
                        state = save_state
                        print("test_loss = {:.3f}".format(test_loss))
        '''one final evaluation of the entire dataset to check the loss'''

        data_loader.reset_batch_pointer()
        state = sess.run(model.initial_state)
        ovr_loss = 0
        start = time.time()
        for b in range(data_loader.pointer, data_loader.num_batches):
            x, y = data_loader.next_batch()
            feed = {
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            }
            train_loss, state, _ = sess.run(
                [model.cost, model.final_state, model.inc_batch_pointer_op],
                feed)
            ovr_loss += train_loss

        speed = time.time() - start
        print("ovr_train_loss = {:.3f}, time_to_eval = {:.3f}".format(
            ovr_loss / data_loader.num_batches, speed))
        '''lets you initialize a model without training it'''
        if args.num_epochs == 0:
            # with zero epochs the training loop never ran, so define the path here and save at step 0
            checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
            saver.save(sess,
                       checkpoint_path,
                       global_step=0)
            print("model saved to {}".format(checkpoint_path))

        train_writer.close()
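One structural caveat in the epoch loop above: calling cosine_decay_restarts (and tf.assign) inside the loop adds new ops to the graph on every epoch. A sketch of a variant that builds the update op once and feeds the step through a placeholder (assumes model.lr is a tf.Variable, as tf.assign already requires):

# built once, before the epoch loop
step_ph = tf.placeholder(tf.int64, shape=[])
lr_update = tf.assign(model.lr,
                      cosine_decay_restarts(args.learning_rate, step_ph, 20000,
                                            t_mul=1.0, m_mul=.1, alpha=1e-12))

# inside the epoch loop
sess.run(lr_update, {step_ph: e * data_loader.num_batches})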