Example #1
        logging.info("******Start server******")
        server.join()
    elif job_name == 'worker':
        train_class = TrainClass(conf_dict)
        device = tf.train.replica_device_setter(worker_device='/job:worker/task:%d' % task_index, cluster=cluster)
        train_class.ConstructGraph(device, server)
        #print_trainable_variables(train_class.sess, 'save.model.txt')

        iter = 0
        err_rate = 1.0

        # stagger start-up: each worker task waits an extra five minutes
        wait_time = 60 * 5 * task_index
        time.sleep(wait_time)
        while iter < 15:
            train_start_t = time.time()
            shuffle = iter > 0  # reshuffle the data after the first pass
            tmp_tr_err_rate = train_class.TrainLogic(device, shuffle=shuffle, train_loss=True, skip_offset=iter)

            train_end_t = time.time()
            logging.info("******train %d iter time is %f ******" % (iter, train_end_t-train_start_t))
            # write text model
            if task_index == 0:
                print_trainable_variables(train_class.sess, conf_dict["checkpoint_dir"] + '/save.model.txt-' + str(iter))
#            tmp_cv_err_rate = train_class.TrainLogic(device, shuffle = False, train_loss = False, skip_offset = iter)
            iter += 1
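
Example #1 presupposes the usual TF1 between-graph replication scaffolding around it: a tf.train.ClusterSpec, a tf.train.Server per task, and tf.train.replica_device_setter to pin variables on the ps job. A minimal sketch of that surrounding setup, with hypothetical host addresses and flag values (not from the original code):

import tensorflow as tf

# Hypothetical cluster layout; real addresses would come from a config file or flags.
cluster = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224'],
})
job_name, task_index = 'worker', 0   # normally parsed from command-line flags
server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

if job_name == 'ps':
    server.join()        # parameter servers only serve variables and block here
elif job_name == 'worker':
    # Variables land on the ps job, compute ops stay on this worker.
    device = tf.train.replica_device_setter(
        worker_device='/job:worker/task:%d' % task_index, cluster=cluster)
    # ... build the graph under tf.device(device), as TrainClass.ConstructGraph does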


Example #2
    def SaveTextModel(self):
        if self.print_trainable_variables_cf:
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir_cf)
            if ckpt and ckpt.model_checkpoint_path:
                print_trainable_variables(self.sess, ckpt.model_checkpoint_path + '.txt')
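
All three examples rely on a project-specific print_trainable_variables helper that is not shown here. A minimal sketch of what such a helper could look like, assuming it simply dumps each trainable variable's name, shape, and values to a text file (the real project's implementation may differ):

import tensorflow as tf

def print_trainable_variables(sess, output_file):
    # Write every trainable variable of the current graph to a plain-text file.
    with open(output_file, 'w') as fh:
        for var in tf.trainable_variables():
            value = sess.run(var)
            fh.write('%s %s\n' % (var.name, list(value.shape)))
            fh.write('%s\n' % value)
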
Example #3
    def construct_graph(self):
        with tf.Graph().as_default():
            self.run_ops = []
            #self.X = tf.placeholder(tf.float32, [None, None, self.input_dim], name='feature')
            print('feature placeholder dims:', self.nnet_conf.num_frames_batch,
                  self.nnet_conf.batch_size, self.input_dim)
            self.X = tf.placeholder(
                tf.float32,
                [self.nnet_conf.num_frames_batch, self.nnet_conf.batch_size, self.input_dim],
                name='feature')
            #self.Y = tf.sparse_placeholder(tf.int32, name="labels")
            self.Y = tf.placeholder(
                tf.int32,
                [self.nnet_conf.batch_size, self.nnet_conf.num_frames_batch],
                name="labels")
            self.seq_len = tf.placeholder(tf.int32, [None], name='seq_len')

            self.learning_rate_var = tf.Variable(
                float(self.nnet_conf.learning_rate),
                trainable=False,
                name='learning_rate')
            if self.use_sgd:
                optimizer = tf.train.GradientDescentOptimizer(
                    self.learning_rate_var)
            else:
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate_var,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-08)

            for i in range(self.num_threads):
                with tf.device("/gpu:%d" % i):
                    initializer = tf.random_uniform_initializer(
                        -self.nnet_conf.init_scale, self.nnet_conf.init_scale)
                    model = LSTM_Model(self.nnet_conf)
                    mean_loss, ce_loss, rnn_keep_state_op, rnn_state_zero_op, label_error_rate, softval = model.ce_train(
                        self.X, self.Y, self.seq_len)
                    if self.use_sgd and self.use_normal:
                        tvars = tf.trainable_variables()
                        grads, _ = tf.clip_by_global_norm(
                            tf.gradients(mean_loss, tvars),
                            self.nnet_conf.grad_clip)
                        train_op = optimizer.apply_gradients(
                            zip(grads, tvars),
                            global_step=tf.contrib.framework.get_or_create_global_step())
                    else:
                        train_op = optimizer.minimize(mean_loss)

                    run_op = {
                        'train_op': train_op,
                        'mean_loss': mean_loss,
                        'ce_loss': ce_loss,
                        'rnn_keep_state_op': rnn_keep_state_op,
                        'rnn_state_zero_op': rnn_state_zero_op,
                        'label_error_rate': label_error_rate,
                        'softval': softval
                    }
                    self.run_ops.append(run_op)
                    tf.get_variable_scope().reuse_variables()

            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
            self.sess = tf.Session(config=tf.ConfigProto(
                intra_op_parallelism_threads=self.num_threads,
                allow_soft_placement=True,
                log_device_placement=False,
                gpu_options=gpu_options))
            init = tf.group(tf.global_variables_initializer(),
                            tf.local_variables_initializer())
            tmp_variables = tf.trainable_variables()
            self.saver = tf.train.Saver(tmp_variables, max_to_keep=100)
            #self.saver = tf.train.Saver(max_to_keep=100)
            if self.restore_training:
                self.sess.run(init)
                ckpt = tf.train.get_checkpoint_state(
                    self.tf_async_model_prefix)
                if ckpt and ckpt.model_checkpoint_path:
                    logging.info("restore training")
                    self.saver.restore(self.sess, ckpt.model_checkpoint_path)
                    self.num_batch_total = self.get_num(
                        ckpt.model_checkpoint_path)
                    if self.print_trainable_variables:
                        print_trainable_variables(
                            self.sess, ckpt.model_checkpoint_path + '.txt')
                        sys.exit(0)

                    logging.info('model:' + ckpt.model_checkpoint_path)
                    logging.info('restore learn_rate:' +
                                 str(self.sess.run(self.learning_rate_var)))
                    #print('*******************',self.num_batch_total)
                    #time.sleep(3)
                    #model_48434.ckpt.final
                    #print("ckpt.model_checkpoint_path",ckpt.model_checkpoint_path)
                    #print("self.tf_async_model_prefix",self.tf_async_model_prefix)
                    #self.saver.restore(self.sess, self.tf_async_model_prefix)

                else:
                    logging.info('No checkpoint file found')
                    self.sess.run(init)
                    logging.info('init learn_rate:' +
                                 str(self.sess.run(self.learning_rate_var)))
            else:
                self.sess.run(init)

            self.total_variables = np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ])
            logging.info('total parameters : %d' % self.total_variables)
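
For context, the graph built in Example #3 would typically be driven by feeding the three placeholders and fetching one of the per-GPU run_op dictionaries. A hypothetical single training step; the function and argument names below are illustrative, not from the original code:

def run_one_step(trainer, gpu_id, feats, labels, lengths):
    # feats:   [num_frames_batch, batch_size, input_dim] float32
    # labels:  [batch_size, num_frames_batch] int32
    # lengths: [batch_size] int32, valid frames per sequence in this chunk
    ops = trainer.run_ops[gpu_id]
    fetches = {
        'train_op': ops['train_op'],
        'mean_loss': ops['mean_loss'],
        'label_error_rate': ops['label_error_rate'],
    }
    out = trainer.sess.run(fetches, feed_dict={
        trainer.X: feats, trainer.Y: labels, trainer.seq_len: lengths})
    return out['mean_loss'], out['label_error_rate']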