Example #1
    def train(self):
        LR, HR = self.double_input_producer()
        global_step = tf.Variable(initial_value=0, trainable=False)
        self.global_step = global_step
        self.build()
        lr = tf.train.polynomial_decay(self.learning_rate,
                                       global_step,
                                       self.decay_step,
                                       end_learning_rate=self.end_lr,
                                       power=1.)

        vars_all = tf.trainable_variables()
        print('Params num of all:', get_num_params(vars_all))
        training_op = tf.train.AdamOptimizer(lr).minimize(
            self.loss, var_list=vars_all, global_step=global_step)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        #sess=tf.Session()
        self.sess = sess
        sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=100,
                                    keep_checkpoint_every_n_hours=1)
        if self.reload:
            self.load(sess, self.save_dir)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        cost_time = 0
        start_time = time.time()
        gs = sess.run(global_step)
        for step in range(sess.run(global_step), self.max_step):
            if step > gs and step % 20 == 0:
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                      'Step:{}, loss:{}'.format(step, loss_v))

            if step % 500 == 0:
                if step > gs:
                    self.save(sess, self.save_dir, step)
                cost_time = time.time() - start_time
                print('cost {}s.'.format(cost_time))
                self.eval()
                cost_time = time.time() - start_time
                start_time = time.time()
                print('cost {}s.'.format(cost_time))

            lr1, hr = sess.run([LR, HR])
            _, loss_v = sess.run([training_op, self.loss],
                                 feed_dict={
                                     self.L: lr1,
                                     self.H: hr,
                                     self.is_train: True
                                 })

            if step > 500 and loss_v > 10:
                print('Model collapsed with loss={}'.format(loss_v))
                break
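
Every TensorFlow example on this page reports the model size by calling get_num_params, either with an explicit list of variables (as above) or with no arguments (as in Example #2 below), but the helper itself is never shown here. A minimal sketch, assuming it simply sums the element counts of the supplied trainable variables, might look like the following; the fallback to tf.trainable_variables() is an assumption made so that the no-argument calls also work.

import numpy as np
import tensorflow as tf

def get_num_params(var_list=None):
    # Hypothetical helper: total number of scalar parameters in var_list.
    # Falling back to all trainable variables in the default graph is an
    # assumption, so both get_num_params(vars_all) and get_num_params() work.
    if var_list is None:
        var_list = tf.trainable_variables()
    return int(sum(np.prod(v.get_shape().as_list()) for v in var_list))

With a definition like this, the 'Params num of all:' print above would report the total trainable parameter count of the built model.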
Example #2
File: test.py Project: zxlation/FC2N
def evaluate():
    # load test datasets
    datasets = ['Set5', 'Set14', 'B100', 'Urban100', 'Manga109']
    loader.load_test_datasets(datasets, [scale])
    with tf.Graph().as_default():
        model.model_compile(np.array(loader.data_mean), scale)
        param = get_num_params()
        print("======== %s [X%d, param = %d] ========" %
              (model.name, scale, param))

        # main body of evaluation
        config = tf.ConfigProto()
        config.log_device_placement = True
        config.allow_soft_placement = True
        with tf.Session(config=config) as sess:
            # restore model from disk
            model_dir = os.path.join(record_dir, model.name, "X%d" % scale,
                                     'train_logs')
            model_dir = os.path.join(model_dir, 'model.ckpt-X%d' % scale)
            model.saver.restore(sess, model_dir)
            for k in list(loader.test_datasets.keys()):
                print("\n%s" % k)
                dataset = loader.test_datasets[k]
                for s in list(dataset.keys()):
                    test_one_dataset(model, dataset[s], s, k)

        print("Done!")
Example #3
    def evaluate(self,
                 load_path=general_config.load_path_test,
                 validFile=None,
                 vocab2intPath=None):
        if validFile is None or vocab2intPath is None:
            validFile = general_config.training_file
            vocab2intPath = general_config.global_nonstatic_v2i_path

        train_generator = PaddedDataIterator(loadPath=validFile,
                                             vocab2intPath=vocab2intPath,
                                             sent_len_cut=self.min_len)
        load_dir = load_path if os.path.isdir(load_path) else os.path.dirname(
            load_path)
        log_dir = load_dir.replace("checkpoints", "logs")
        logger = my_logger(log_dir + "/log_evaluate.txt")

        os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.8

        with tf.Session(config=config, graph=self.graph) as sess:
            logger.info("Loading model...")
            saver = tf.train.Saver()
            if os.path.isdir(load_path):
                ckpt = tf.train.get_checkpoint_state(load_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
                global_step = ckpt.model_checkpoint_path.split("-")[-1]
            else:
                saver.restore(sess, load_path)
                global_step = load_path.split("-")[-1]
            logger.info("Loading successfully, loading epoch is %s" %
                        global_step)
            logger.info("The total number of trainable variables: %s" %
                        get_num_params())

            cur_loop = train_generator.loop
            cur_count = 0
            avg_loss_t, avg_accuracy_t = 0., 0.
            _, batch_seqs, batch_labels, batch_lens = train_generator.next(
                1024, need_all=True)
            while (train_generator.loop == cur_loop):
                cur_count += 1
                loss_t, acc_t = sess.run([self.loss_op, self.acc_op],
                                         feed_dict=self._feed_dict_valid(
                                             batch_seqs, batch_labels,
                                             batch_lens))
                avg_loss_t += loss_t
                avg_accuracy_t += acc_t
                _, batch_seqs, batch_labels, batch_lens = train_generator.next(
                    1024, need_all=True)
            avg_loss_t /= cur_count
            avg_accuracy_t /= cur_count
            logger.info("Loss: %.4f, Accuracy: %.4f " %
                        (avg_loss_t, avg_accuracy_t))
        return avg_loss_t, avg_accuracy_t
Example #4
    def fit(self,
            trainFile=None,
            with_validation=general_config.with_validation,
            log_dir=general_config.log_dir + "/TextCNN",
            save_dir=general_config.save_dir + "/TextCNN",
            load_path=general_config.load_path_train,
            num_epochs=general_config.num_epochs,
            steps_every_epoch=general_config.steps_every_epoch,
            batch_size=general_config.batch_size,
            learning_rate=general_config.learning_rate,
            lr_changing=general_config.lr_changing,
            min_learning_rate=general_config.min_learning_rate,
            learning_rate_decay=general_config.learning_rate_decay,
            save_epochs=general_config.save_epochs,
            early_stopping=general_config.early_stopping,
            num_visual=general_config.num_visualize):

        self.learning_rate_value = learning_rate

        self.trainFile = trainFile
        self.validFile = None
        self.with_validation = with_validation
        if self.trainFile is None:
            if self.with_validation:
                self.trainFile = general_config.train_file
            else:
                self.trainFile = general_config.training_file
        if self.with_validation:
            self.validFile = self.trainFile.replace("train", "valid")
        tmp = os.path.join(os.path.dirname(self.trainFile),
                           os.path.basename(self.trainFile).replace(".txt", "").split("_")[0])
        if self.model_type in ["static","multichannel"]:
            self.int2vocabPath = general_config.global_static_i2v_path
            self.vocab2intPath = general_config.global_static_v2i_path
        else:
            self.int2vocabPath = tmp + "_i2v.json"
            self.vocab2intPath = tmp + "_v2i.json"
        metadataPath = {
            "static": "/home/leechen/code/python/TextSentimentClassification/data_helpers/dataset/training_testing_metadata.tsv"}
        metadataPath["nonstatic"] = "/home/leechen/code/python/TextSentimentClassification/" \
                                    + self.vocab2intPath.replace("v2i.json", "metadata.tsv")
        train_loss = []
        train_accuracy = []
        valid_loss = []
        valid_accuracy = []
        # Log files written during training and the model save directory
        if self.with_validation:
            log_dir=ensure_dir_exist(log_dir+"/"+self.model_type+"/train_valid")
            train_dir = os.path.join(log_dir, "train")
            val_dir = os.path.join(log_dir, "valid")
            save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train_valid")
        else:
            log_dir=ensure_dir_exist(log_dir+"/"+self.model_type+"/train")
            train_dir = os.path.join(log_dir, "train")
            val_dir=None
            save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train")

        # Create the logger
        logger=my_logger(log_dir+"/log_fit.txt")
        msg = "\n--filter_size_list: %s\n" % self.filter_size_list \
              + "--filter_num: %s\n" % self.filter_num \
              + "--fc_layer_size_list: %s\n" % self.fc_layer_size_list \
              + "--embedding_size: %s\n" % self.embedding_size \
              + "--dropout: %s\n" % self.dropout_value \
              + "--max_l2_norm: %s\n" % self.max_l2_norm \
              + "--learning_rate: %s\n" % self.learning_rate_value \
              + "--lr_changing: %s\n" % lr_changing \
              + "--min_learning_rate: %s\n" % min_learning_rate\
              + "--learning_rate_decay: %s\n" % learning_rate_decay\
              +"--load_path: %s\n" % load_path\
              +"--num_epochs: %s\n" % num_epochs\
              +"--steps_every_epoch: %s\n" % steps_every_epoch\
              +"--batch_size: %s\n" % batch_size\
              +"--save_epochs: %s\n" % save_epochs\
              +"--early_stopping: %s\n" % early_stopping\
              +"--num_visual: %s\n"%num_visual
        logger.info(msg)

        # Define the data generators
        train_generator = PaddedDataIterator(loadPath=self.trainFile,vocab2intPath=self.vocab2intPath)
        val_generator = None if self.validFile is None else PaddedDataIterator(loadPath=self.validFile,
                                                                               vocab2intPath=self.vocab2intPath)

        os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.8

        with tf.Session(config=config,graph=self.graph) as sess:
            train_writer = tf.summary.FileWriter(train_dir, sess.graph)
            val_writer = None if val_dir is None else tf.summary.FileWriter(val_dir)
            saver = tf.train.Saver(max_to_keep=5)
            sess.run(tf.global_variables_initializer())
            start = 0
            if isinstance(load_path,str):
                if os.path.isdir(load_path):
                    ckpt = tf.train.get_checkpoint_state(load_path)
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    start = int(ckpt.model_checkpoint_path.split("-")[-1])
                else:
                    saver.restore(sess, load_path)
                    start = int(load_path.split("-")[-1])
                logger.info("Loading successfully, loading epoch is %s" % start)
            logger.info("The total number of trainable variables: %s"%get_num_params())
            cur_early_stopping=0
            cur_max_acc=0.
            
            logger.info('******* start training with %d *******' % start)
            epoch=0
            for epoch in range(start, num_epochs):
                if lr_changing:
                    try:
                        if (train_loss[-1]>train_loss[-2]):
                            tmp=self.learning_rate_value*learning_rate_decay
                            if (tmp>=min_learning_rate):
                                self.learning_rate_value=tmp
                                logger.info("Learning rate multiplied by %s at epoch %s."
                                            %(learning_rate_decay,epoch+1))
                        else:
                            if (train_loss[-1]<train_loss[-2]-0.015):
                                self.learning_rate_value*=1.05
                                logger.info("Learning rate multiplied by 1.05 at epoch %s."%(epoch+1))
                    except:
                        pass
                avg_loss_t, avg_accuracy_t = 0, 0
                avg_loss_v, avg_accuracy_v = 0, 0
                for step in range(steps_every_epoch):
                    _, batch_seqs, batch_labels, _ = train_generator.next(batch_size)
                    batch_seqs_ns=None
                    if self.model_type=="multichannel":
                        batch_seqs_ns = self._X2X_ns(batch_seqs)
                    sess.run(self.train_op,self._feed_dict_train(batch_x=batch_seqs, batch_y=batch_labels,
                                                                 batch_x_ns=batch_seqs_ns))
                    loss_t, acc_t= sess.run([self.loss_op, self.acc_op],
                                            self._feed_dict_valid(batch_x=batch_seqs,
                                                                batch_y=batch_labels,
                                                                 batch_x_ns=batch_seqs_ns))
                    avg_loss_t += loss_t
                    avg_accuracy_t += acc_t
                avg_loss_t/=steps_every_epoch
                avg_accuracy_t/=steps_every_epoch
                train_loss.append(avg_loss_t)
                train_accuracy.append(avg_accuracy_t)
                self.loss_accuracy_summary.value[0].simple_value = avg_loss_t
                self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_t
                train_writer.add_summary(summary=self.loss_accuracy_summary, global_step=epoch + 1)
                if self.with_validation:
                    # Compute performance on the validation set
                    cur_loop=val_generator.loop
                    _, batch_seqs, batch_labels,_ = val_generator.next(1024,need_all=True)
                    batch_seqs_ns=None
                    if self.model_type == "multichannel":
                        batch_seqs_ns = self._X2X_ns(batch_seqs)
                    cur_count=0
                    while(val_generator.loop==cur_loop):
                        loss_v, acc_v = sess.run([self.loss_op, self.acc_op],
                                                 feed_dict= self._feed_dict_valid(batch_x=batch_seqs,
                                                batch_y=batch_labels,batch_x_ns=batch_seqs_ns))
                        avg_loss_v += loss_v
                        avg_accuracy_v += acc_v
                        cur_count += 1
                        _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                        batch_seqs_ns = None
                        if self.model_type == "multichannel":
                            batch_seqs_ns = self._X2X_ns(batch_seqs)
                    avg_loss_v/=cur_count
                    avg_accuracy_v/=cur_count
                    valid_loss.append(avg_loss_v)
                    valid_accuracy.append(avg_accuracy_v)
                    self.loss_accuracy_summary.value[0].simple_value = avg_loss_v
                    self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_v
                    val_writer.add_summary(summary=self.loss_accuracy_summary, global_step=epoch + 1)
                    logger.info("Epoch: [%04d/%04d], "
                          "Training Loss: %.4f, Training Accuracy: %.4f, "
                          "Validation Loss: %.4f, Validation Accuracy: %.4f" \
                          % (epoch + 1, num_epochs,
                             avg_loss_t, avg_accuracy_t, avg_loss_v, avg_accuracy_v))

                    # Stop early if validation accuracy has failed to exceed the best accuracy so far for more than early_stopping consecutive epochs.
                    if (avg_accuracy_v > cur_max_acc):
                        cur_max_acc = avg_accuracy_v
                        cur_early_stopping = 0
                        logger.info("Saving model-%s" % (epoch + 1))
                        saver.save(sess, os.path.join(save_dir, 'model.ckpt'), global_step=epoch + 1)
                    else:
                        cur_early_stopping += 1
                    if cur_early_stopping > early_stopping:
                        logger.info("Early stopping after epoch %s !" % (epoch + 1))
                        break
                else:
                    logger.info("Epoch: [%04d/%04d], "
                                "Training Loss: %.4f, Training Accuracy: %.4f " \
                                % (epoch + 1, num_epochs,avg_loss_t, avg_accuracy_t))
                # Periodically save the model
                if (epoch - start + 1) % save_epochs == 0:
                    logger.info("Saving model-%s"%(epoch+1))
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'), global_step=epoch + 1)

            if num_visual > 0:
                # Visualize the final word embeddings
                config = projector.ProjectorConfig()
                final_embeddings = {}
                try:
                    final_embeddings["static"] = self.embedding_matrix_s.eval()[:num_visual]
                except:
                    pass
                try:
                    final_embeddings["nonstatic"] = self.embedding_matrix_ns.eval()[:num_visual]
                except:
                    pass
                for (name, final_embedding) in final_embeddings.items():
                    embedding_var = tf.Variable(final_embedding, name="word_embeddings_" + name)
                    sess.run(embedding_var.initializer)
                    saver = tf.train.Saver([embedding_var])
                    saver.save(sess, log_dir + "/embeddings_" + name+".ckpt-"+str(epoch+1))
                    embedding = config.embeddings.add()
                    embedding.tensor_name = embedding_var.name
                    embedding.metadata_path = metadataPath[name]
                projector.visualize_embeddings(train_writer, config)
        return train_loss, train_accuracy, valid_loss, valid_accuracy
Example #5
File: train.py Project: zxlation/FC2N
def train(scale):
    model.model_compile( np.array(loader.data_mean), scale)
    model_params = get_num_params()
    
    # prepare data
    loader.load_train_dataset()
    loader.load_valid_dataset()
    loader.load_batch()
    
    max_saver = tf.train.Saver(max_to_keep = 2, allow_empty = True)
    
    config = tf.ConfigProto()
    config.log_device_placement = True
    config.allow_soft_placement = True
    with tf.Session(config = config) as sess: 
        # defining summary writer
        summary_writer = tf.summary.FileWriter(train_log_dir, sess.graph)
        
        # resume training from an existing model
        init_step = 0
        if train_from_exist:
            fmtstr = "restoring model from %s..." % exist_model_dir
            print(colored(fmtstr, "green", attrs = ["bold"]))
            init_step = model.restore_model(exist_model_dir, model.global_steps)
        else:
            fmtstr = "initializing variables..."
            print(colored(fmtstr, "green", attrs = ["bold"]))
            sess.run(tf.global_variables_initializer())
        
        max_psnr = 0
        cur_psnr = 0
        print(colored("starting to train...", 'green', attrs = ['bold']))
        for step in range(init_step, max_steps):
            # Batch extraction is included in the timed section so that data
            # preprocessing time is measured as well.
            start_time = time.time()
            lr_batch, hr_batch, scale = loader.work_queue.get()
            model.train_batch(lr_batch, scale, hr_batch)
            duration = time.time() - start_time
            
            if step == 0 or ((step + 1) % 1000 == 0):
                # valid model using Set5
                formatstr = "%s: [%s (%d)]" % (datetime.now(), model.name, model_params)
                print(colored(formatstr, 'green', attrs = ['bold']))
                
                examples_per_sec = loader.batch_size/(duration + 1e-10)
                formatstr = 'step %d: %.4f images/sec' % (step + 1, examples_per_sec)
                print(colored(formatstr, 'blue', attrs = ['bold']))
                cur_psnr = valid_one_scale(model, loader.valid_dataset, scale, step + 1)  
            
            if (step + 1) % 200 == 0:
                model.feed_dict[model.inputs] = lr_batch
                model.feed_dict[model.scale]  = scale
                model.feed_dict[model.labels] = hr_batch
                summary_str = sess.run(model.summary_ops, feed_dict = model.feed_dict)
                summary_writer.add_summary(summary_str, step + 1)
                    
            if (step + 1) % 500 == 0:
                checkpoint_path = os.path.join(train_log_dir, 'model.ckpt')
                print("saving checkpoint into: %s-%d" % (checkpoint_path, step + 1))
                model.saver.save(sess, checkpoint_path, global_step = step + 1)
            
            if ((step + 1) % 500 == 0) and (cur_psnr > max_psnr):
                max_psnr = cur_psnr
                checkpoint_path = os.path.join(max_log_dir, 'model.ckpt')
                print("saving checkpoint into: %s-%d" % (checkpoint_path, step + 1))
                max_saver.save(sess, checkpoint_path, global_step = step + 1)
            
        summary_writer.close()
Example #6
    def train(self):
        """Train video sr network"""
        global_step = tf.Variable(initial_value=0, trainable=False)
        self.global_step = global_step

        # Create folder for logs
        if not tf.gfile.Exists(self.save_dir):
            tf.gfile.MakeDirs(self.save_dir)

        self.build_model()
        lr = tf.train.polynomial_decay(self.learning_rate,
                                       global_step,
                                       self.decay_step,
                                       end_learning_rate=self.end_lr,
                                       power=1.)
        tf.summary.scalar('learning_rate', lr)
        vars_all = tf.trainable_variables()
        vars_sr = [v for v in vars_all if 'srmodel' in v.name]
        vars_flow = [v for v in vars_all if 'flow' in v.name]
        train_all = tf.train.AdamOptimizer(lr).minimize(
            self.loss, var_list=vars_all, global_step=global_step)
        train_flow = tf.train.AdamOptimizer(lr).minimize(
            self.loss_flow, var_list=vars_flow, global_step=global_step)
        train_sr = tf.train.AdamOptimizer(lr).minimize(self.loss_mse,
                                                       var_list=vars_sr,
                                                       global_step=global_step)

        print('params num of flow:', get_num_params(vars_flow))
        print('params num of sr:', get_num_params(vars_sr))
        print('params num of all:', get_num_params(vars_all))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        #sess=tf.Session()
        self.sess = sess
        sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=100,
                                    keep_checkpoint_every_n_hours=1)
        if self.reload:
            self.load(sess, self.save_dir)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        cost_time = 0
        start_time = time.time()
        gs = sess.run(global_step)
        for step in range(sess.run(global_step), self.max_step):
            if step < 10000:
                train_op = train_sr
            else:
                train_op = train_all

            if step > gs and step % 20 == 0:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                    'Step:{}, loss:({:.3f},{:.3f},{:.3f}), mse:{}'.format(
                        step, loss_value, loss_mse_value,
                        loss_flow_value * 100, str(mse_value)))

            if step % 500 == 0:
                if step > gs:
                    self.save(sess, self.save_dir, step)
                cost_time = time.time() - start_time
                print('cost {}s.'.format(cost_time))
                self.evaluation()
                cost_time = time.time() - start_time
                start_time = time.time()
                print('cost {}s.'.format(cost_time))

            _, loss_value, mse_value, loss_mse_value, loss_flow_value = sess.run(
                [train_op, self.loss, self.mse, self.loss_mse, self.loss_flow])
            # print (loss_value)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
Example #7
    def __init__(self, parameters):
        self.num_frames = parameters.num_frames
        self.scale = parameters.scale
        self.in_size = parameters.in_size
        self.gt_size = self.in_size * self.scale
        self.batch_size = parameters.batch_size
        self.learning_rate = parameters.learning_rate
        self.end_lr = parameters.end_lr
        self.reload = parameters.reload
        self.max_step = parameters.max_step
        self.decay_step = parameters.decay_step
        self.train_dir = parameters.train_dir
        self.eval_dir = parameters.eval_dir
        self.save_dir = parameters.save_dir
        self.log_dir = parameters.log_dir
        self.tensorboard_dir = parameters.tensorboard_dir

        self.main_channel_nums = parameters.main_channel_nums
        self.save_iter_gap = parameters.save_iter_gap
        self.start_epoch = parameters.start_epoch

        # build the main network computational graph
        # # the main SR network: EDVR or PFNL
        self.model = EDVR_Core(nf=self.main_channel_nums,
                               nframes=self.num_frames)

        self.GT = tf.placeholder(tf.float32,
                                 shape=[None, 1, None, None, 3],
                                 name='H_truth')
        self.L_train = tf.placeholder(tf.float32,
                                      shape=[
                                          self.batch_size, self.num_frames,
                                          self.in_size, self.in_size, 3
                                      ],
                                      name='L_train')
        self.SR = self.forward(self.L_train)

        self.L_test = tf.placeholder(tf.float32,
                                     shape=[1, self.num_frames, None, None, 3],
                                     name='L_test')
        self.SR_test = self.forward(self.L_test)

        self.loss = tf.reduce_mean(tf.sqrt((self.SR - self.GT)**2 + 1e-6))

        # data loader and training supports
        self.LR_one_batch, self.HR_one_batch = self.double_input_producer()
        global_step = tf.Variable(initial_value=0, trainable=False)
        self.global_step = global_step

        lr = tf.train.polynomial_decay(self.learning_rate,
                                       global_step,
                                       self.decay_step,
                                       end_learning_rate=self.end_lr,
                                       power=1.)
        vars_all = tf.trainable_variables()
        print('Params num of all:', get_num_params(vars_all))
        self.training_op = tf.train.AdamOptimizer(lr).minimize(
            self.loss, var_list=vars_all, global_step=global_step)

        # For tensorboard visualization

        # used in eval func
        self.loss_epoch = tf.placeholder(tf.float32,
                                         shape=[],
                                         name='epoch_loss_placeholder')
        self.epoch_loss_summary_op = tf.summary.scalar('loss/epoch loss',
                                                       self.loss_epoch)
        self.psnr_eval = tf.placeholder(tf.float32,
                                        shape=[],
                                        name='eval_psnr_placeholder')
        self.eval_psnr_summary_op = tf.summary.scalar('metrics/eval psnr',
                                                      self.psnr_eval)
        self.ssim_eval = tf.placeholder(tf.float32,
                                        shape=[],
                                        name='eval_ssim_placeholder')
        self.eval_ssim_summary_op = tf.summary.scalar('metrics/eval ssim',
                                                      self.ssim_eval)
        self.merge_op_eval = tf.summary.merge([
            self.epoch_loss_summary_op, self.eval_psnr_summary_op,
            self.eval_ssim_summary_op
        ])

        # used in iter training func
        iter_loss_summary_op = tf.summary.scalar("loss/iter loss", self.loss)
        lr_summary_op = tf.summary.scalar("lr", lr)
        self.merge_op_training = tf.summary.merge(
            [iter_loss_summary_op, lr_summary_op])

        # writer, get session and hold it and some configs
        self.writer = tf.summary.FileWriter(self.tensorboard_dir,
                                            tf.get_default_graph())

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        print('[**] Initializing global variables ...')
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=50,
                                    keep_checkpoint_every_n_hours=1)
        if self.reload:
            print('[**] loading checkpoint in dir:' + self.save_dir)
            self.load(self.sess, self.save_dir)

        # eval file prepare
        self.eval_frame_data_HR = []
        self.eval_frame_data_LR = []
        pathlists = open(self.eval_dir, 'rt').read().splitlines()
        for dataPath in pathlists:
            inList = sorted(
                glob.glob(
                    os.path.join('H:/AI4K/data/frame_data/validation/LR',
                                 dataPath, '*.png')))
            gtList = sorted(
                glob.glob(
                    os.path.join('H:/AI4K/data/frame_data/validation/HR',
                                 dataPath, '*.png')))
            assert (len(inList) == len(gtList))
            self.eval_frame_data_HR.append(gtList)
            self.eval_frame_data_LR.append(inList)
Example #8
def train_model(model_class,
                run_func,
                args,
                quiet=False,
                splits=None,
                abs_output_dir=False):
    output_dir = args.output_dir

    val_stat = args.val_stat
    # Keeps track of certain stats for all the data splits
    all_stats = {
        'val_%s' % val_stat: [],
        'test_%s' % val_stat: [],
        'best_epoch': [],
        'train_last': [],
        'train_best': [],
        'nce': [],
    }

    # Iterate over splits
    splits_iter = splits if splits is not None else range(args.n_splits)
    # Iterates through each split of the data
    for split_idx in splits_iter:
        # print('Training split idx: %d' % split_idx)

        # Creates the output directory for the run of the current split
        if not abs_output_dir:
            args.output_dir = output_dir + '/run_%d' % split_idx
        args.model_dir = args.output_dir + '/models'
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)
        write_args(args)

        # Create model and optimizer
        model = model_class(args)
        model.to(args.device)

        if args.separate_lr:
            optim = model.get_model_optim()
        else:
            optim = torch.optim.Adam(model.parameters(), lr=args.lr)

        if split_idx == 0:
            # Print the number of parameters
            num_params = get_num_params(model)
            if not quiet:
                print('Initialized model with %d params' % num_params)

        # Load the train, val, test data
        dataset_loaders = {}
        for data_type in ['train', 'val', 'test']:
            dataset_loaders[data_type] = get_loader(
                args.data_dir,
                data_type=data_type,
                batch_size=args.batch_size,
                shuffle=data_type == 'train',
                split=split_idx,
                n_labels=args.n_labels)

        # Keeps track of stats across all the epochs
        train_m, val_m = StatsManager(), StatsManager()

        # Tensorboard logging, only for the first run split
        if args.log_tb and split_idx == 0:
            log_dir = output_dir + '/logs'
            tb_writer = SummaryWriter(log_dir, max_queue=1, flush_secs=60)
            log_tensorboard(tb_writer, {'params': num_params}, '', 0)
        else:
            args.log_tb = False

        # Training loop
        args.latest_train_stat = 0
        args.latest_val_stat = 0  # Keeps track of the latest relevant stat
        patience_idx = 0
        for epoch_idx in range(args.n_epochs):
            args.epoch = epoch_idx
            train_stats = run_func(model=model,
                                   optim=optim,
                                   data_loader=dataset_loaders['train'],
                                   data_type='train',
                                   args=args,
                                   write_path=None,
                                   quiet=quiet)
            should_write = epoch_idx % args.write_every == 0
            val_stats = run_func(
                model=model,
                optim=None,
                data_loader=dataset_loaders['val'],
                data_type='val',
                args=args,
                write_path='%s/val_output_%d.jsonl' %
                (args.output_dir, epoch_idx) if should_write else None,
                quiet=quiet)

            if not quiet:
                train_stats.print_stats('Train %d: ' % epoch_idx)
                val_stats.print_stats('Val   %d: ' % epoch_idx)

            if args.log_tb:
                log_tensorboard(tb_writer, train_stats.get_stats(), 'train',
                                epoch_idx)
                log_tensorboard(tb_writer, val_stats.get_stats(), 'val',
                                epoch_idx)

            train_stats.add_stat('epoch', epoch_idx)
            val_stats.add_stat('epoch', epoch_idx)

            train_m.add_stats(train_stats.get_stats())
            val_m.add_stats(val_stats.get_stats())

            if val_stats.get_stats()[val_stat] == min(val_m.stats[val_stat]):
                save_model(model,
                           args,
                           args.model_dir,
                           epoch_idx,
                           should_print=not quiet)
                patience_idx = 0
            else:
                patience_idx += 1
                if args.patience != -1 and patience_idx >= args.patience:
                    print(
                        'Validation error has not improved in %d, stopping at epoch: %d'
                        % (args.patience, args.epoch))
                    break

            # Keep track of the latest epoch stats
            args.latest_train_stat = train_stats.get_stats()[val_stat]
            args.latest_val_stat = val_stats.get_stats()[val_stat]

        # Load and save the best model
        best_epoch = val_m.get_best_epoch_for_stat(args.val_stat)
        best_model_path = '%s/model_%d' % (args.model_dir, best_epoch)
        model, _ = load_model(best_model_path,
                              model_class=model_class,
                              device=args.device)
        if not quiet:
            print('Loading model from %s' % best_model_path)

        save_model(model, args, args.model_dir, 'best', should_print=not quiet)

        # Test model
        test_stats = run_func(model=model,
                              optim=None,
                              data_loader=dataset_loaders['test'],
                              data_type='test',
                              args=args,
                              write_path='%s/test_output.jsonl' %
                              args.output_dir,
                              quiet=quiet)
        if not quiet:
            test_stats.print_stats('Test: ')

        if args.log_tb:
            log_tensorboard(tb_writer, test_stats.get_stats(), 'test', 0)
            tb_writer.close()

        # Write test output to a summary file
        with open('%s/summary.txt' % args.output_dir, 'w+') as summary_file:
            for k, v in test_stats.get_stats().items():
                summary_file.write('%s: %.3f\n' % (k, v))

        # Aggregate relevant stats
        all_stats['val_%s' % val_stat].append(min(val_m.stats[val_stat]))
        all_stats['test_%s' % val_stat].append(
            test_stats.get_stats()[val_stat])
        all_stats['best_epoch'].append(best_epoch)

        all_stats['train_last'].append(train_m.stats[val_stat][-1])
        all_stats['train_best'].append(train_m.stats[val_stat][best_epoch])

        if args.nce_coef > 0:
            all_stats['nce'].append(train_m.stats['nce_reg'][best_epoch])

    # Write the stats aggregated across all splits
    with open('%s/summary.txt' % (output_dir), 'w+') as summary_file:
        summary_file.write('Num epochs trained: %d\n' % args.epoch)
        for name, stats_arr in all_stats.items():
            if stats_arr == []:
                continue
            stats_arr = np.array(stats_arr)
            stats_mean = np.mean(stats_arr)
            stats_std = np.std(stats_arr)
            summary_file.write('%s: %s, mean: %.3f, std: %.3f\n' %
                               (name, str(stats_arr), stats_mean, stats_std))

    all_val_stats = np.array(all_stats['val_%s' % val_stat])
    all_test_stats = np.array(all_stats['test_%s' % val_stat])

    val_mean, val_std = np.mean(all_val_stats), np.std(all_val_stats)
    test_mean, test_std = np.mean(all_test_stats), np.std(all_test_stats)

    train_last = np.mean(np.array(all_stats['train_last']))
    train_best = np.mean(np.array(all_stats['train_best']))

    if args.nce_coef > 0:
        nce_loss = np.mean(np.array(all_stats['nce']))
    else:
        nce_loss = 0

    # Return stats
    return (val_mean, val_std), (test_mean, test_std), (train_last,
                                                        train_best), nce_loss
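
Example #8 is the one PyTorch snippet on this page: get_num_params is called on a model object rather than on a list of TensorFlow variables, so the helper must be a different implementation. A minimal sketch under that assumption, counting the elements of the model's trainable parameters, could be:

import torch.nn as nn

def get_num_params(model: nn.Module) -> int:
    # Hypothetical PyTorch variant: total element count over trainable parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)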
Example #9
File: drvsr.py Project: saiku-20/History
    def train(self):
        def train_op_func(loss, var_list, is_gradient_clip=False):
            if is_gradient_clip:
                train_op = tf.train.AdamOptimizer(lr, self.beta1)
                grads_and_vars = train_op.compute_gradients(loss,
                                                            var_list=var_list)
                unchanged_gvs = [(grad, var) for grad, var in grads_and_vars
                                 if not 'LSTM' in var.name]
                rnn_grad = [
                    grad for grad, var in grads_and_vars if 'LSTM' in var.name
                ]
                rnn_var = [
                    var for grad, var in grads_and_vars if 'LSTM' in var.name
                ]
                capped_grad, _ = tf.clip_by_global_norm(rnn_grad, clip_norm=3)
                capped_gvs = list(zip(capped_grad, rnn_var))
                train_op = train_op.apply_gradients(grads_and_vars=capped_gvs +
                                                    unchanged_gvs,
                                                    global_step=global_step)
            else:
                # train_op = tf.train.GradientDescentOptimizer(lr).minimize(loss, var_list=var_list, global_step=global_step)
                train_op = tf.train.AdamOptimizer(lr).minimize(
                    loss, var_list=var_list, global_step=global_step)
            return train_op

        """Train video sr network"""
        global_step = tf.Variable(initial_value=0, trainable=False)
        self.global_step = global_step

        # Create folder for logs
        if not tf.gfile.Exists(self.save_dir):
            tf.gfile.MakeDirs(self.save_dir)

        self.build_model()
        lr = tf.train.polynomial_decay(self.learning_rate,
                                       global_step,
                                       self.decay_step,
                                       end_learning_rate=self.end_lr,
                                       power=0.9)
        # tf.summary.scalar('learning_rate', lr)
        vars_all = tf.trainable_variables()
        vars_sr = [v for v in vars_all if 'srmodel' in v.name]
        vars_flow = [v for v in vars_all if 'flow' in v.name]
        train_all = train_op_func(self.loss, vars_all, is_gradient_clip=True)
        train_flow = train_op_func(self.loss_flow,
                                   vars_flow,
                                   is_gradient_clip=True)
        train_sr = train_op_func(self.loss_mse, vars_sr, is_gradient_clip=True)

        print('params num of flow:', get_num_params(vars_flow))
        print('params num of sr:', get_num_params(vars_sr))
        print('params num of all:', get_num_params(vars_all))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        #sess=tf.Session()
        self.sess = sess
        sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=100,
                                    keep_checkpoint_every_n_hours=1)
        #self.flownets.load_easyflow(sess, os.path.join('./easyflow_log/model1', 'checkpoints'))
        if self.reload:
            self.load(sess, self.save_dir)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # summary_op = tf.summary.merge_all()
        # summary_writer = tf.summary.FileWriter(self.train_dir, sess.graph, flush_secs=30)

        cost_time = 0
        start_time = time.time()
        gs = sess.run(global_step)
        for step in range(sess.run(global_step), self.max_steps):
            if step < 10000:
                train_op = train_sr  #train_flow
            else:
                train_op = train_all

            if step > gs and step % 20 == 0:
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                    'Step:{}, loss:({:.3f},{:.3f},{:.3f}), mse:{}'.format(
                        step, loss_value, loss_mse_value,
                        loss_flow_value * 100, str(mse_value)))

            # if step % 50 == 0:
            #     # summary_str = sess.run(summary_op, feed_dict={inputs:batch_input, gt:batch_gt})
            #     summary_str = sess.run(summary_op)
            #     summary_writer.add_summary(summary_str, global_step=step)

            if step % 500 == 0:
                if step > gs:
                    self.save(sess, self.save_dir, step)
                cost_time = time.time() - start_time
                print('cost {}s.'.format(cost_time))
                self.evaluation()
                cost_time = time.time() - start_time
                start_time = time.time()
                print('cost {}s.'.format(cost_time))

            _, loss_value, mse_value, loss_mse_value, loss_flow_value = sess.run(
                [train_op, self.loss, self.mse, self.loss_mse, self.loss_flow])
            # print (loss_value)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
Example #10
    def train(self):

        # LR, HR = self.single_input_producer()
        LR, HR = self.double_input_producer()
        global_step = tf.Variable(initial_value=0, trainable=False)
        self.global_step = global_step
        self.build()

        lr = tf.train.polynomial_decay(self.learning_rate,
                                       global_step,
                                       self.decay_step,
                                       end_learning_rate=self.end_lr,
                                       power=1.)
        vars_all = tf.trainable_variables()
        print('Params num of all:', get_num_params(vars_all))
        training_op = tf.train.AdamOptimizer(lr).minimize(
            self.loss, var_list=vars_all, global_step=global_step)

        # For tensorboard visualization
        writer = tf.summary.FileWriter(self.tensorboard_dir,
                                       tf.get_default_graph())
        tf.summary.scalar("loss", self.loss)
        merge_op = tf.summary.merge_all()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

        self.sess = sess
        sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=50,
                                    keep_checkpoint_every_n_hours=1)
        if self.reload:
            self.load(sess, self.save_dir)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        cost_time = 0
        start_time = time.time()
        gs = sess.run(global_step)
        for step in range(sess.run(global_step), self.max_step):
            if step > gs:  # and step % 20 == 0:
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                      'Step:{}, loss:{}'.format(step, loss_v))

            # eval and save model
            if step % self.save_iter_gap == 0:
                if step > gs:
                    print('saving model at global step: ' + str(step))
                    self.save(sess, self.save_dir, step)
                cost_time = time.time() - start_time
                print('cost {}s.'.format(cost_time))
                # self.eval()
                cost_time = time.time() - start_time
                start_time = time.time()
                print('cost {}s.'.format(cost_time))

            lr1, hr = sess.run([LR, HR])
            _, loss_v, ss = sess.run([training_op, self.loss, merge_op],
                                     feed_dict={self.L: lr1, self.H: hr})

            writer.add_summary(ss, step)

            if step > 500 and loss_v > 10:
                print('Model collapsed with loss={}'.format(loss_v))
                break

        writer.close()
Example #11
File: main.py Project: arpit9295/ce7455
# Get best val score index of each model
best_val_cnn_1_index = val_cnn_1.index(max(val_cnn_1))
best_val_lstm_1_index = val_lstm_1.index(max(val_lstm_1))

# Get test score w.r.t best validation score of each model
test_cnn_1 = all_F_1_CNN[best_val_cnn_1_index][2]
test_lstm_1 = all_F_1_LSTM[best_val_lstm_1_index][2]

# Get best val score of each model
val_cnn_1 = max(val_cnn_1)
val_lstm_1 = max(val_lstm_1)

# Construct results table
df = pd.DataFrame()
df['model'] = ['cnn_cnn', 'lstm_cnn']
df['parameters'] = [get_num_params(cnn_cnn), get_num_params(lstm_cnn)]
df['val'] = [val_cnn_1, val_lstm_1]
df['test'] = [test_cnn_1, test_lstm_1]

print(
    'CNN char-level encoder vs LSTM char-level encoder (both using Single CNN word-level encoder)\n'
)
print(df)

# #### 2. `Single-layer CNN word-level encoder` vs `Multi-layer CNN word-level encoder` (all using `CNN char-level encoder`)

# Get all val scores of each model
val_cnn_1 = [f1[1] for f1 in all_F_1_CNN]
val_cnn_2 = [f1[1] for f1 in all_F_2_CNN]
val_cnn_3 = [f1[1] for f1 in all_F_3_CNN]