def train_net():
    # ------ Build the computation graph ------
    x_train, y_train, x_val, y_val = input_data.read_img(
        FILEPATH, WIDTH, HEIGHT, CHANNELS, ratio)
    x_train_batch, y_train_batch = input_data.bulid_batch(
        x_train, y_train, BATCH_SIZE)
    x_val_batch, y_val_batch = input_data.bulid_batch(x_val, y_val, BATCH_SIZE)
    batch_train_len = x_train_batch.shape[0]
    batch_val_len = x_val_batch.shape[0]

    # Define the network: x is the input placeholder, y the label placeholder.
    # image_max = tf.reduce_max(x_train, name='image_max')
    # image_min = tf.reduce_min(x_train, name='image_min')
    x = tf.placeholder(tf.float32,
                       shape=[BATCH_SIZE, HEIGHT, WIDTH, CHANNELS],
                       name='input')
    y = tf.placeholder(tf.int64, shape=[BATCH_SIZE], name='labels_placeholder')
    _, _, softmax_linear = model.build_network(x, True, False)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=softmax_linear, labels=y, name='xentropy_per_example')
    train_loss = tf.reduce_mean(cross_entropy, name='loss')

    # Only rewrite the graph for quantization-aware training when a quantized
    # model is requested.
    if modeltype != 'NOQUANT':
        # Insert fake-quant ops in front of the ops they quantize.
        tf.contrib.quantize.create_training_graph(
            input_graph=tf.get_default_graph(), quant_delay=2000)

    train_step = trainning(train_loss, LEARNING_RATE)

    # Accuracy
    correct = tf.nn.in_top_k(softmax_linear, y, 1)
    correct = tf.cast(correct, tf.float16)
    train_acc = tf.reduce_mean(correct)
    # ------ End of computation graph ------

    with tf.Session() as sess:
        saver = tf.compat.v1.train.Saver()
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        valstep = 0
        # max = sess.run(image_max)
        # min = sess.run(image_min)

        # Training loop
        try:
            ckpt = tf.train.get_checkpoint_state(TRAIN_LOGS_DIR)
            global_step = 0
            if ckpt and ckpt.model_checkpoint_path:
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            for i in range(MAX_STEP + 1):
                # if_train = True
                pos = i % batch_train_len
                _, acc, loss = sess.run([train_step, train_acc, train_loss],
                                        feed_dict={
                                            x: x_train_batch[pos],
                                            y: y_train_batch[pos]
                                        })
                # Print accuracy and loss every 50 steps.
                if i % 50 == 0:
                    print('Step %d, train loss = %.2f, train accuracy = %.2f%%'
                          % (i, loss, acc * 100.0))
                # Evaluate on the validation set every 200 steps.
                if i % 200 == 0:
                    # if_train = False
                    # In quantized mode use a variable instead of a placeholder.
                    # Note: when exporting to TFLite, do not feed if_train via a placeholder!
                    vpos = valstep % batch_val_len
                    val_loss, val_acc = sess.run([train_loss, train_acc],
                                                 feed_dict={
                                                     x: x_val_batch[vpos],
                                                     y: y_val_batch[vpos]
                                                 })
                    valstep = valstep + 1
                    print('** Step %d, val loss = %.2f, val accuracy = %.2f%% **'
                          % (i, val_loss, val_acc * 100.0))
                # Save a checkpoint every 500 steps.
                if i % 500 == 0:
                    checkpoint_path = os.path.join(TRAIN_LOGS_DIR, 'saved_model.ckpt')
                    tmpstep = i + int(global_step)
                    saver.save(sess, checkpoint_path, global_step=tmpstep)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
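# --- Hedged sketch (not part of the original script): exporting the fake-quant
# trained model above to TFLite. It assumes TF 1.x with tf.contrib available,
# that the second argument of model.build_network is an is-training flag, and
# that the checkpoint layout matches train_net(); the output filename and the
# converter settings are illustrative, and a fully integer model would need
# extra converter options depending on the TF version.
def export_tflite_sketch():
    eval_graph = tf.Graph()
    with eval_graph.as_default():
        x = tf.placeholder(tf.float32,
                           shape=[1, HEIGHT, WIDTH, CHANNELS], name='input')
        # Rebuild the network in inference mode (assumption: 2nd arg = is_training).
        _, _, softmax_linear = model.build_network(x, False, False)
        # Insert the inference-time fake-quant ops that mirror the ones added
        # by create_training_graph() during training.
        tf.contrib.quantize.create_eval_graph(input_graph=eval_graph)
        saver = tf.train.Saver()
        with tf.Session(graph=eval_graph) as sess:
            ckpt = tf.train.get_checkpoint_state(TRAIN_LOGS_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
            converter = tf.lite.TFLiteConverter.from_session(
                sess, [x], [softmax_linear])
            with open('model_quant.tflite', 'wb') as f:
                f.write(converter.convert())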
def deep_maxent_irl():
    # hyper parameters
    H = 48
    W = 128
    N_STATES = H * W
    N_ACTIONS = 8
    SHAPE = [H, W, N_STATES, N_ACTIONS]
    DISCOUNT = 1
    LEARNING_RATE_BASE = 0.001
    DECAY_STEPS = 500
    DECAY_RATE = 0.99
    GRAPH_SAVE_INTERVAL = 50
    IMG_PATH = '/home/zhuzeyu/real_datasets/DIRL_DataSets/orig_img/'
    REF_PATH = '/home/zhuzeyu/real_datasets/DIRL_DataSets/track_ref/'

    # create model directory
    MODEL_DIR = "model"
    if not tf.gfile.Exists(MODEL_DIR):
        tf.gfile.MakeDirs(MODEL_DIR)

    # placeholders
    input_img = tf.placeholder(tf.float32, [None, H, W, 3], name='input_img')
    grad_r_placeholder = tf.placeholder(tf.float32, [H * W, 1])

    # define reward and loss
    rewards = inference.inference(input_img)
    rewards_flattened = tf.reshape(rewards, [N_STATES, 1])
    theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    l2_loss = tf.reduce_mean([tf.nn.l2_loss(v) for v in theta])
    l2_loss = l2_loss / 100000.0
    loss = tf.multiply(grad_r_placeholder, rewards_flattened)
    loss = tf.reduce_sum(loss, name='loss')  # + l2_loss

    # define training
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step,
                                    DECAY_STEPS, DECAY_RATE)
    optimizer = tf.train.GradientDescentOptimizer(lr)
    train_step = optimizer.minimize(loss, global_step=global_step)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=100)
    extracts = get_queue(IMG_PATH)
    # path_orig = IMG_PATH + extracts[447]
    # path_ref = REF_PATH + extracts[447]
    # terminal, start, img, traj, ref = input_data.read_img(SHAPE, path_orig, path_ref)

    with tf.Session() as sess:
        init.run()
        for epoch in range(20):
            for iteration in range(100):
                path_orig = IMG_PATH + extracts[iteration]
                path_ref = REF_PATH + extracts[iteration]
                terminal, start, img, traj, ref = input_data.read_img(
                    SHAPE, path_orig, path_ref)
                # get rewards
                r = sess.run(rewards, feed_dict={input_img: img})
                r_np = np.reshape(r, [-1, ])
                # get policy
                _, policy = value_iteration.value_iteration(
                    SHAPE, r_np, DISCOUNT, terminal)
                # compute expected svf
                mu_exp = compute_state_visitation_freq2(SHAPE, traj, start, policy)
                # compute expert svf
                # mu_D = demo_svf(traj, N_STATES)
                mu_D = field_svf(ref)
                # compute loss gradient
                grad_r = mu_exp - mu_D
                index = np.sum(np.abs(grad_r))
                grad_r = np.reshape(grad_r, [-1, 1])  # originally a 1-D array
                # train
                sess.run(train_step,
                         feed_dict={grad_r_placeholder: grad_r, input_img: img})
                lss = sess.run(loss,
                               feed_dict={grad_r_placeholder: grad_r, input_img: img})
                print(index)
                print(lss)
                # print(sess.run(l2_loss))
                # save graph
                if (iteration + 1) % GRAPH_SAVE_INTERVAL == 0:
                    MODEL_NAME = 'model' + str(epoch) + str(iteration) + '.ckpt'
                    saver.save(sess, os.path.join(MODEL_DIR, MODEL_NAME))
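# --- Hedged sketch (not in the original file): one common way to implement the
# demo_svf() helper referenced in the commented-out line above, i.e. the expert
# state-visitation frequency estimated from demonstration trajectories. It
# assumes `trajs` is an iterable of trajectories whose steps are flat state
# indices in [0, n_states); the actual trajectory format produced by
# input_data.read_img may differ.
def demo_svf(trajs, n_states):
    p = np.zeros(n_states)
    for traj in trajs:
        for state in traj:
            p[state] += 1.0      # count every visit to a state index
    p = p / len(trajs)           # average counts over trajectories
    return p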
import numpy as np
import tensorflow as tf

from input_data import read_img
from input_data import set_val
from model import inference

train_dir = 'train/'
logs_train_dir = 'save/model'

# Resize every image to 100 x 100.
w = 100
h = 100
c = 3

data, label, num_classes = read_img(train_dir)
x_train, y_train, x_val, y_val = set_val(data, label, 0.7)  # 0.7 is the train/val split ratio

# Placeholders
x = tf.placeholder(tf.float32, shape=[None, w, h, c], name='x')
y_ = tf.placeholder(tf.int32, shape=[None, ], name='y_')

# Returns a function that applies L2 regularization; adding a regularization
# term to the loss is an important way to prevent overfitting.
regularizer = tf.contrib.layers.l2_regularizer(0.0001)
logits = inference(x, False, regularizer, num_classes)

# Small trick: multiply logits by 1 and give the result an explicit name, so
# the output tensor can be fetched by name ('logits_eval') when the model is
# reloaded later.
b = tf.constant(value=1, dtype=tf.float32)
logits_eval = tf.multiply(logits, b, name='logits_eval')

# Note: this is the per-example cross-entropy (no tf.reduce_mean); minimize()
# sums it when computing gradients.
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_)
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), y_)
# tf.equal returns a `Tensor` of type `bool`.
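# --- Hedged sketch (not in the original file): how the named tensors above can
# be used at inference time, which is why 'logits_eval' was given an explicit
# name. It assumes a checkpoint was later saved under logs_train_dir and that
# `images` is a float array shaped [batch, w, h, c]; the exact checkpoint
# filename is illustrative.
def predict_sketch(images):
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(logs_train_dir + '.meta')
        saver.restore(sess, logs_train_dir)
        graph = tf.get_default_graph()
        x_t = graph.get_tensor_by_name('x:0')                 # input placeholder
        logits_t = graph.get_tensor_by_name('logits_eval:0')  # named output
        scores = sess.run(logits_t, feed_dict={x_t: images})
        return np.argmax(scores, axis=1)                      # predicted class ids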
def epoch(weight=None, height=None, dimension=None, class_size=None,
          train=False, path=None, ratio=None, n_epoch=None,
          batch_size=None, model_path=None):
    regularizer = tf.contrib.layers.l2_regularizer(0.001)
    x = tf.placeholder(tf.float32,
                       shape=[None, weight, height, dimension], name='x')
    y_ = tf.placeholder(tf.int32, shape=[None, ], name='y_')

    # TODO: switch between different CNN backbones here.
    logits, pred = interface(input_tensor=x, regularizer=regularizer,
                             train=train, class_size=class_size)
    # logits, pred = cnn(input_sensor=x, class_size=class_size)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), y_)
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Input data
    data, label = input_data.read_img(path, weight, height, dimension)
    x_train, y_train, x_val, y_val = input_data.shuffle_and_period(
        data, label, ratio)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    for epoch in range(n_epoch):
        # training
        train_loss, train_acc, n_batch = 0, 0, 0
        for x_train_a, y_train_a in input_data.mini_batches(
                x_train, y_train, batch_size, shuffle=True):
            _, err, ac = sess.run([train_op, loss, acc],
                                  feed_dict={x: x_train_a, y_: y_train_a})
            train_loss += err
            train_acc += ac
            n_batch += 1
        print('Epoch %d - train loss: %f' % (epoch, train_loss / n_batch))
        print('Epoch %d - train acc: %f' % (epoch, train_acc / n_batch))

        # validation
        val_loss, val_acc, n_batch = 0, 0, 0
        for x_val_a, y_val_a in input_data.mini_batches(
                x_val, y_val, batch_size, shuffle=False):
            err, ac = sess.run([loss, acc],
                               feed_dict={x: x_val_a, y_: y_val_a})
            val_loss += err
            val_acc += ac
            n_batch += 1
        print('Epoch %d - Validation loss: %f' % (epoch, val_loss / n_batch))
        print('Epoch %d - Validation Accuracy: %f' % (epoch, val_acc / n_batch))

        if epoch % 5 == 0:
            saver.save(sess, model_path + "save_net.ckpt", epoch)
            print('Trained Model Saved.')
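# --- Hedged usage example (not in the original file): one plausible way to call
# epoch(). The concrete values (100x100x3 inputs, 10 classes, dataset path,
# 0.8 train/val split, 20 epochs, batch size 64, model directory) are
# illustrative and not taken from the project.
if __name__ == '__main__':
    epoch(weight=100, height=100, dimension=3, class_size=10,
          train=True, path='./data/', ratio=0.8,
          n_epoch=20, batch_size=64, model_path='./model/')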
def main(unused_argv):
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    # Construct the cluster and start the server.
    # Read the cluster description.
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    # Get the number of workers.
    num_workers = len(worker_spec)

    # Create the TensorFlow cluster description object.
    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

    # Create a TensorFlow Server object for the local task.
    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        # From tf.train.Server onwards each process behaves differently: the
        # job name passed on the command line decides which task it runs.
        # If the job name is "ps", the process joins here and serves parameter
        # updates, waiting for workers to push gradients; if it is "worker",
        # it runs the computation defined below.
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        # A parameter server simply starts and blocks here;
        # tf.train.replica_device_setter below assigns the variables to it.
        if FLAGS.job_name == "ps":
            server.join()

    # Worker nodes: the chief is the worker with task_index 0.
    is_chief = (FLAGS.task_index == 0)

    if FLAGS.num_gpus > 0:
        # Avoid GPU allocation conflicts: assign task_num -> gpu
        # for each worker on the corresponding machine.
        gpu = (FLAGS.task_index % FLAGS.num_gpus)
        # Run this worker on the selected GPU.
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to the worker server.
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)

    # The device setter automatically places Variable ops on the parameter
    # servers (ps), which use the CPU; non-Variable ops are placed on the
    # workers using the worker_device chosen above. Variables defined under
    # this `with` block are assigned to the parameter servers, round-robin
    # when there is more than one ps.
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           ps_device="/job:ps/cpu:0",
                                           cluster=cluster)):
        with tf.variable_scope('inputdata') as scope:
            # Read images and labels.
            train, train_label = input_data.read_img(FLAGS.train_dir)
            # Build batches.
            train_batch, train_label_batch = input_data.get_batch(
                train, train_label, FLAGS.IMG_W, FLAGS.IMG_H,
                FLAGS.BATCH_SIZE, FLAGS.CAPACITY)

        # Global step, initialized to 0.
        global_step = tf.Variable(0, name="global_step", trainable=False)

        train_logits = model.inference(train_batch, FLAGS.BATCH_SIZE,
                                       FLAGS.N_CLASSES)
        train_loss = model.losses(train_logits, train_label_batch)
        accuracy = model.evaluation(train_logits, train_label_batch)

        # Merge all summaries into a single op that can be run in a session.
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()
        print("Variables initialized ...")

        # Asynchronous training: each replica updates the parameters as soon
        # as it has computed its gradients, without coordinating with others.
        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        # Synchronous training
        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate
            # Use SyncReplicasOptimizer (between-graph replication); it
            # averages the gradients from all replicas before applying them.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(train_loss, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                # The chief worker is responsible for parameter initialization,
                # checkpointing and summaries.
                local_init_op = opt.chief_init_op
            ready_for_local_init_op = opt.ready_for_local_init_op
            # Initial tokens and chief queue runner required by sync_replicas mode.
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()

        if FLAGS.sync_replicas:
            # Create a Supervisor to track the training process.
            # `logdir` is where checkpoints are saved and loaded from: on
            # startup the Supervisor looks for a checkpoint there and restores
            # it, otherwise it runs init_op. The chief worker initializes the
            # parameters while the other workers wait, then all start training
            # together. global_step is shared by every replica and is
            # incremented automatically each time the loss is minimized, so it
            # counts the total number of steps across the cluster.
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=FLAGS.logs_train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.logs_train_dir,
                                     init_op=init_op,
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        # Session config: allow_soft_placement=True lets ops without a GPU
        # implementation fall back to the CPU automatically.
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % FLAGS.task_index])

        # The chief worker (task_index == 0) prepares the session,
        # while the remaining workers wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..."
                  % FLAGS.task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
            print("Using existing server at: %s" % server_grpc_url)
            # prepare_or_wait_for_session starts training only after the
            # parameters are initialized and the chief is ready.
            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % FLAGS.task_index)

        if FLAGS.sync_replicas and is_chief:
            # The chief worker starts the chief queue runner and runs the init op.
            sess.run(sync_init_op)
            global threads
            threads = sv.start_queue_runners(sess, [chief_queue_runner])
        else:
            threads = sv.start_queue_runners(sess)

        # Perform distributed training.
        time_begin = time.time()
        coord = tf.train.Coordinator()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        try:
            for step in np.arange(FLAGS.MAX_STEP):
                if coord.should_stop():
                    break
                _, tra_loss, tra_acc = sess.run(
                    [train_step, train_loss, accuracy])
                if step % 50 == 0:
                    print('Step %d, train loss = %.2f, train accuracy = %.2f%%'
                          % (step, tra_loss, tra_acc * 100.0))
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
            sess.close()

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)
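# --- Hedged usage note (not in the original file): this script is launched once
# per cluster member, with the role selected via the flags used in main().
# The script name and host addresses below are illustrative:
#
#   python distributed_train.py --ps_hosts=10.0.0.1:2222 \
#       --worker_hosts=10.0.0.2:2222,10.0.0.3:2222 \
#       --job_name=ps --task_index=0
#
#   python distributed_train.py --ps_hosts=10.0.0.1:2222 \
#       --worker_hosts=10.0.0.2:2222,10.0.0.3:2222 \
#       --job_name=worker --task_index=0
#
# The worker with task_index 0 acts as the chief; passing --sync_replicas
# switches from asynchronous to synchronous gradient updates.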
import numpy as np
import tensorflow as tf

import img_utils
import input_data
import mdp.value_iteration4 as vi
import mdp.gridworld2 as gridworld

H = 24
W = 64
N_STATES = H * W
TRAJ_LEN = 4


def int_to_point(i):
    # Convert a flat state index into (x, y) grid coordinates.
    return i % W, i // W


__, img, _ = input_data.read_img(H, W)

input_img = tf.placeholder(tf.float32, [None, H, W, 3])

sess = tf.Session()
# Load the meta graph and restore the weights.
saver = tf.train.import_meta_graph(
    '/Users/David/Desktop/model/model24*64/ckpt/model119.ckpt.meta')
saver.restore(sess, '/Users/David/Desktop/model/model24*64/ckpt/model119.ckpt')

# List all tensors in the graph (not strictly necessary).
graph = tf.get_default_graph()
tensor_name_list = [
    tensor.name for tensor in tf.get_default_graph().as_graph_def().node
]
print(tensor_name_list)
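# --- Hedged sketch (not in the original file): once the checkpoint is restored,
# run the network on the loaded image and read the reward map back as an
# H x W grid. The tensor names 'input_img:0' and 'rewards:0' are assumptions
# and must be checked against tensor_name_list printed above; int_to_point()
# turns the best flat state index back into (x, y) grid coordinates.
input_t = graph.get_tensor_by_name('input_img:0')   # assumed input tensor name
reward_t = graph.get_tensor_by_name('rewards:0')    # assumed output tensor name
reward_map = sess.run(reward_t, feed_dict={input_t: img})
reward_map = np.reshape(reward_map, [H, W])
best_state = int(np.argmax(reward_map))              # flat index of the highest reward
print(int_to_point(best_state))                      # its (x, y) grid coordinate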