logging.info("******Start server******") server.join() elif job_name == 'worker': train_class = TrainClass(conf_dict) device = tf.train.replica_device_setter(worker_device='/job:worker/task:%d' % task_index, cluster=cluster) train_class.ConstructGraph(device,server) #print_trainable_variables(train_class.sess, 'save.model.txt') iter = 0 err_rate = 1.0 # every five minutes start one job wait_time = 60 * 5 * task_index time.sleep(wait_time) while iter < 15: train_start_t = time.time() shuffle = False if iter > 0: shuffle = True tmp_tr_err_rate = train_class.TrainLogic(device, shuffle = shuffle, train_loss = True, skip_offset = iter) train_end_t = time.time() logging.info("******train %d iter time is %f ******" % (iter, train_end_t-train_start_t)) # write text model if task_index == 0: print_trainable_variables(train_class.sess, conf_dict["checkpoint_dir"] + '/save.model.txt-' + str(iter)) # tmp_cv_err_rate = train_class.TrainLogic(device, shuffle = False, train_loss = False, skip_offset = iter) iter += 1
def SaveTextModel(self):
    """Dump the trainable variables of the latest checkpoint to a text file."""
    if self.print_trainable_variables_cf:
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir_cf)
        if ckpt and ckpt.model_checkpoint_path:
            print_trainable_variables(self.sess, ckpt.model_checkpoint_path + '.txt')
def construct_graph(self):
    with tf.Graph().as_default():
        self.run_ops = []
        #self.X = tf.placeholder(tf.float32, [None, None, self.input_dim], name='feature')
        print(self.nnet_conf.num_frames_batch, self.nnet_conf.batch_size, self.input_dim)
        self.X = tf.placeholder(
            tf.float32,
            [self.nnet_conf.num_frames_batch, self.nnet_conf.batch_size, self.input_dim],
            name='feature')
        #self.Y = tf.sparse_placeholder(tf.int32, name="labels")
        self.Y = tf.placeholder(
            tf.int32,
            [self.nnet_conf.batch_size, self.nnet_conf.num_frames_batch],
            name="labels")
        self.seq_len = tf.placeholder(tf.int32, [None], name='seq_len')

        self.learning_rate_var = tf.Variable(float(self.nnet_conf.learning_rate),
                                             trainable=False, name='learning_rate')

        if self.use_sgd:
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_var)
        else:
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate_var,
                beta1=0.9, beta2=0.999, epsilon=1e-08)

        # Build one training tower per GPU; all towers share the same variables.
        for i in range(self.num_threads):
            with tf.device("/gpu:%d" % i):
                initializer = tf.random_uniform_initializer(
                    -self.nnet_conf.init_scale, self.nnet_conf.init_scale)
                model = LSTM_Model(self.nnet_conf)
                mean_loss, ce_loss, rnn_keep_state_op, rnn_state_zero_op, label_error_rate, softval = model.ce_train(
                    self.X, self.Y, self.seq_len)
                if self.use_sgd and self.use_normal:
                    # Clip gradients by global norm when training with plain SGD.
                    tvars = tf.trainable_variables()
                    grads, _ = tf.clip_by_global_norm(
                        tf.gradients(mean_loss, tvars), self.nnet_conf.grad_clip)
                    train_op = optimizer.apply_gradients(
                        zip(grads, tvars),
                        global_step=tf.contrib.framework.get_or_create_global_step())
                else:
                    train_op = optimizer.minimize(mean_loss)
                run_op = {
                    'train_op': train_op,
                    'mean_loss': mean_loss,
                    'ce_loss': ce_loss,
                    'rnn_keep_state_op': rnn_keep_state_op,
                    'rnn_state_zero_op': rnn_state_zero_op,
                    'label_error_rate': label_error_rate,
                    'softval': softval
                }
                self.run_ops.append(run_op)
                tf.get_variable_scope().reuse_variables()

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
        self.sess = tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=self.num_threads,
            allow_soft_placement=True,
            log_device_placement=False,
            gpu_options=gpu_options))
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        tmp_variables = tf.trainable_variables()
        self.saver = tf.train.Saver(tmp_variables, max_to_keep=100)
        #self.saver = tf.train.Saver(max_to_keep=100)
        if self.restore_training:
            self.sess.run(init)
            ckpt = tf.train.get_checkpoint_state(self.tf_async_model_prefix)
            if ckpt and ckpt.model_checkpoint_path:
                logging.info("restore training")
                self.saver.restore(self.sess, ckpt.model_checkpoint_path)
                self.num_batch_total = self.get_num(ckpt.model_checkpoint_path)
                if self.print_trainable_variables:
                    # Only dump the restored model to text, then quit.
                    print_trainable_variables(self.sess, ckpt.model_checkpoint_path + '.txt')
                    sys.exit(0)
                logging.info('model:' + ckpt.model_checkpoint_path)
                logging.info('restore learn_rate:' + str(self.sess.run(self.learning_rate_var)))
            else:
                logging.info('No checkpoint file found')
                self.sess.run(init)
                logging.info('init learn_rate:' + str(self.sess.run(self.learning_rate_var)))
        else:
            self.sess.run(init)

        self.total_variables = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ])
        logging.info('total parameters : %d' % self.total_variables)
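A hedged sketch of driving one of the per-GPU run_op dictionaries built by construct_graph for a single step; trainer stands for an instance of this class, and the dimension variables and randomly generated feed values are illustrative only:

import numpy as np

num_frames_batch, batch_size, input_dim = 20, 16, 40   # placeholder dimensions
feat = np.random.rand(num_frames_batch, batch_size, input_dim).astype(np.float32)
labs = np.zeros((batch_size, num_frames_batch), dtype=np.int32)
lens = np.full(batch_size, num_frames_batch, dtype=np.int32)

ops = trainer.run_ops[0]   # tower placed on /gpu:0
_, loss, err = trainer.sess.run(
    [ops['train_op'], ops['mean_loss'], ops['label_error_rate']],
    feed_dict={trainer.X: feat, trainer.Y: labs, trainer.seq_len: lens})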