def show_torus_ring():
    pyosr.init()
    dpy = pyosr.create_display()
    glctx = pyosr.create_gl_context(dpy)
    g = tf.Graph()
    util.mkdir_p(ckpt_dir)
    with g.as_default():
        learning_rate_input = tf.placeholder(tf.float32)
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)
        masterdriver = rldriver.RLDriver(MODELS,
                                         init_state,
                                         view_config,
                                         config.SV_VISCFG,
                                         config.MV_VISCFG,
                                         use_rgb=True)
        global_step = tf.contrib.framework.get_or_create_global_step()
        increment_global_step = tf.assign_add(global_step, 1,
                                              name='increment_global_step')
        saver = tf.train.Saver(masterdriver.get_nn_args() + [global_step])
        last_time = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir=ckpt_dir)
            print('ckpt {}'.format(ckpt))
            epoch = 0
            policy_before, value_before, _, _ = masterdriver.evaluate(sess)
            # print("Last b before {}".format(sess.run(masterdriver.get_nn_args()[-2])))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                epoch = sess.run(global_step)
                print('Restored!, global_step {}'.format(epoch))
            else:
                print('Cannot find checkpoint at {}'.format(ckpt_dir))
                return
            policy_after, value_after, _, _ = masterdriver.evaluate(sess)
            print("Value Before Restoring {} and After {}".format(value_before, value_after))
            # print("Last b {}".format(sess.run(masterdriver.get_nn_args()[-2])))
            driver = masterdriver
            r = masterdriver.renderer
            fig = plt.figure()

            class ReAnimator(object):
                reaching_terminal = False
                driver = None
                im = None
                sess = None

                def __init__(self, driver, sess):
                    self.driver = driver
                    self.sess = sess

                def perform(self, framedata):
                    driver = self.driver
                    r = driver.renderer
                    sess = self.sess
                    if not self.reaching_terminal:
                        policy, value, img, dep = driver.evaluate(sess)
                        policy = policy.reshape(driver.action_size)
                        action = driver.make_decision(policy, sess)
                        nstate, reward, self.reaching_terminal = driver.get_reward(action)
                        valid = r.is_valid_state(nstate)
                        print('Current Value {} Policy {} Action {} Reward {}'.format(
                            value, policy, action, reward))
                        print('\tNew State {} Collision Free ? {}'.format(nstate, valid))
                        # print('Action {}, New State {}'.format(action, nstate))
                        rgb = np.squeeze(img[0, 0, :, :, :], axis=[0, 1])
                        if self.im is None:
                            print('rgb {}'.format(rgb.shape))
                            self.im = plt.imshow(rgb)
                        else:
                            self.im.set_array(rgb)
                        r.state = nstate

            ra = ReAnimator(driver, sess)
            ani = animation.FuncAnimation(fig, ra.perform)
            plt.show()
stop_requested = False

if USE_LSTM:
    global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)
else:
    global_network = GameACFFNetwork(ACTION_SIZE, -1, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, initial_learning_rate,
                                        learning_rate_input, grad_applier,
                                        MAX_TIME_STEP, device=device)
    training_threads.append(training_thread)

# prepare session
config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
device = "/cpu:0" initial_learning_rates = log_uniform(settings.INITIAL_ALPHA_LOW, settings.INITIAL_ALPHA_HIGH, settings.INITIAL_ALPHA_LOG_RATE) stop_requested = False print("Creating the global network...") global_network = Network(0, device) learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=settings.RMSP_ALPHA, momentum=0.0, epsilon=settings.RMSP_EPSILON, clip_norm=settings.MAX_GRADIENT_NORM, device=device) print("Global network created !") # Create and initialize the workers workers = [] for i in range(settings.NB_THREADS): print("\nCreating worker %i..." % (i + 1)) worker = Agent(i + 1, global_network, initial_learning_rates, learning_rate_input, grad_applier, device) workers.append(worker) print("\nEvery worker has been created !") # prepare session
global_t = 0

stop_requested = False

if settings.agent_type == 'LSTM':
    global_network = GameACLSTMNetwork(settings.action_size, -1, device)
else:
    global_network = GameACFFNetwork(settings.action_size, -1, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=settings.rmsp_alpha,
                              momentum=0.0,
                              epsilon=settings.rmsp_epsilon,
                              clip_norm=settings.grad_norm_clip,
                              device=device)

for i in range(settings.parallel_agent_size):
    training_thread = A3CTrainingThread(
        i, global_network, initial_learning_rates[i], learning_rate_input,
        grad_applier, settings.max_time_step, device, settings.action_size,
        settings.gamma, settings.local_t_max, settings.entropy_beta,
        settings.agent_type, settings.performance_log_interval,
        settings.log_level, settings.random_seed)
    training_threads.append(training_thread)

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
def train():
    # initial learning rates
    pinitial_learning_rate = log_uniform(PINITIAL_ALPHA_LOW, PINITIAL_ALPHA_HIGH,
                                         INITIAL_ALPHA_LOG_RATE)
    vinitial_learning_rate = log_uniform(VINITIAL_ALPHA_LOW, VINITIAL_ALPHA_HIGH,
                                         INITIAL_ALPHA_LOG_RATE)

    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)

        plearning_rate_input = tf.placeholder("float")
        vlearning_rate_input = tf.placeholder("float")

        pgrad_applier = RMSPropApplier(learning_rate=plearning_rate_input,
                                       decay=RMSP_ALPHA,
                                       momentum=0.0,
                                       epsilon=RMSP_EPSILON,
                                       clip_norm=GRAD_NORM_CLIP,
                                       device=device)
        vgrad_applier = RMSPropApplier(learning_rate=vlearning_rate_input,
                                       decay=RMSP_ALPHA,
                                       momentum=0.0,
                                       epsilon=RMSP_EPSILON,
                                       clip_norm=GRAD_NORM_CLIP,
                                       device=device)

        tf.set_random_seed(1)

        # There is no global network
        training_thread = A3CTrainingThread(0, "", pinitial_learning_rate,
                                            plearning_rate_input, pgrad_applier,
                                            vinitial_learning_rate,
                                            vlearning_rate_input, vgrad_applier,
                                            MAX_TIME_STEP,
                                            device=device,
                                            task_index=FLAGS.task_index)

        # prepare session
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            global_step = tf.get_variable('global_step', [],
                                          initializer=tf.constant_initializer(0),
                                          trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype,
                                            shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)

            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)

            init_op = tf.global_variables_initializer()

            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=LOG_FILE,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)

        with sv.managed_session(server.target) as sess:
            # set start_time
            wall_t = 0.0
            start_time = time.time() - wall_t
            training_thread.set_start_time(start_time)

            local_t = 0
            while True:
                if sess.run([global_step])[0] > MAX_TIME_STEP:
                    break
                diff_global_t = training_thread.process(
                    sess, sess.run([global_step])[0], "", summary_op, "",
                    score_ph, score_ops)
                sess.run(global_step_ops,
                         {global_step_ph: sess.run([global_step])[0] + diff_global_t})
                local_t += diff_global_t

        sv.stop()
        print("Done")
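# --- illustration only, not part of the snippet above ---
# Quick, self-contained check of the host-list construction used in train():
# with a hostname of "localhost", a starting port of 2222, 1 ps host and
# 2 worker hosts (all hypothetical values), the two loops hand out
# consecutive ports, one per process.
ps_hosts_demo = []
worker_hosts_demo = []
port_demo = 2222
for _ in range(1):
    ps_hosts_demo.append("localhost:" + str(port_demo))
    port_demo += 1
for _ in range(2):
    worker_hosts_demo.append("localhost:" + str(port_demo))
    port_demo += 1
# ps_hosts_demo     -> ['localhost:2222']
# worker_hosts_demo -> ['localhost:2223', 'localhost:2224']
# These lists are what tf.train.ClusterSpec({"ps": ..., "worker": ...}) receives.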
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):
        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(self.local_network.policy_loss,
                                             self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()
        self.policy_applier = RMSPropApplier(learning_rate=self.learning_rate_input,
                                             decay=0.99,
                                             momentum=0.0,
                                             epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(self.local_network.value_loss,
                                            self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()
        self.value_applier = RMSPropApplier(learning_rate=self.learning_rate_input,
                                            decay=0.99,
                                            momentum=0.0,
                                            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        # linearly decay the learning rate towards zero over max_global_time_step
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        # sample an action index from the policy distribution via its cumulative sum
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={self.score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset the accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # copy weights from the shared network to the local network
        sess.run(self.sync)

        start_local_t = self.local_t

        # loop LOCAL_T_MAX (5) times
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # advance the game
            self.game_state.process(action)

            # result of the step
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1
            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)
                self.episode_reward = 0
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute gradients and accumulate them, working backwards in time
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]
                     })

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
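# --- illustration only, not part of the class above ---
# A minimal, self-contained sketch of the backward pass in process(): rewards
# are folded into a discounted n-step return R, and the TD error R - V(s) is
# what scales the policy gradient. All numbers below are made up.
GAMMA_DEMO = 0.99
rewards_demo = [0.0, 0.0, 1.0]   # oldest ... newest reward in the rollout
values_demo = [0.5, 0.6, 0.7]    # V(s_t) predicted for each visited state
R_demo = 0.8                     # bootstrap value V(s_{t+n}) when not terminal

for r_i, v_i in zip(reversed(rewards_demo), reversed(values_demo)):
    R_demo = r_i + GAMMA_DEMO * R_demo
    td_demo = R_demo - v_i
    # td_demo weights the log-probability of the taken action in the policy loss;
    # R_demo is the regression target fed to the value loss.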
def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
              agent_type, action_size, rand_seed, checkpoint_dir):
    # use CPU for weight visualize tool
    device = "/cpu:0"

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    game = GameState(rand_seed, action_size)
    game.process(0)
    x_t = game.x_t
    plt.imshow(x_t, interpolation="nearest", cmap=plt.cm.gray)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    W_conv1 = sess.run(global_network.W_conv1)

    # show graph of W_conv1
    fig, axes = plt.subplots(4, 16, figsize=(12, 6),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(4 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    W_conv2 = sess.run(global_network.W_conv2)

    # show graph of W_conv2
    fig, axes = plt.subplots(2, 32, figsize=(27, 6),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(2 * 32)):
        inch = i // 32
        outch = i % 32
        img = W_conv2[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    arr = sess.run(global_network.get_vars())

    s = tf.placeholder("float", [None, 84, 84, 4])
    b_conv1 = sess.run(global_network.b_conv1)
    b_conv2 = sess.run(global_network.b_conv2)

    inp_1 = tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID")
    h_conv1 = tf.nn.relu(inp_1 + b_conv1)
    inp_2 = tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding="VALID")
    h_conv2 = tf.nn.relu(inp_2 + b_conv2)

    s_t = game.s_t
    getActivations(sess, s, h_conv1, s_t, 16)
    getActivations(sess, s, h_conv2, s_t, 32)
if __name__ == "__main__": if len(sys.argv) != 2: print ("Usage %s <checkpoint-name>" % sys.argv[0]) else: # use CPU for display tool device = "/cpu:0" global_network = MasterNetwork(Constants.NUM_ACTIONS, device) learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = Constants.RMSP.ALPHA, momentum = 0.0, epsilon = Constants.RMSP.EPSILON, clip_norm = Constants.RMSP.GRADIENT_NORM_CLIP, device = device) sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(sys.argv[1]) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint")
def run(self):
    device = "/cpu:0"
    if USE_GPU:
        device = "/gpu:0"

    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)

    self.global_t = 0
    self.reward_collector = []

    self.stop_requested = False
    self.terminate_reqested = False

    action_size = Environment.get_action_size(flags.env_type, flags.env_name)

    self.global_network = UnrealModel(action_size, -1,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta, device)
    self.trainers = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=flags.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=flags.rmsp_epsilon,
                                  clip_norm=flags.grad_norm_clip,
                                  device=device)

    for i in range(flags.parallel_size):
        print('building trainer', i)
        trainer = Trainer(i, self.global_network, initial_learning_rate,
                          learning_rate_input, grad_applier, flags.env_type,
                          flags.env_name, flags.use_pixel_change,
                          flags.use_value_replay, flags.use_reward_prediction,
                          flags.pixel_change_lambda, flags.entropy_beta,
                          flags.local_t_max, flags.gamma, flags.gamma_pc,
                          flags.experience_history_size, flags.max_time_step,
                          device, self.reward_collector)
        self.trainers.append(trainer)
    print('')

    # prepare session
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.sess.run(tf.global_variables_initializer())

    # summary for tensorboard
    self.score_input = tf.placeholder(tf.int32)
    tf.summary.scalar("score", self.score_input)

    self.summary_op = tf.summary.merge_all()
    self.summary_writer = tf.summary.FileWriter(flags.log_file, self.sess.graph)

    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_vars())

    # Loading script
    checkpoint = tf.train.get_checkpoint_state(flags.load_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        self.global_t = int(tokens[1])
        print(">>> global step set: ", self.global_t)
        # set wall time
        wall_t_fname = flags.load_dir + '/' + 'wall_t.' + str(self.global_t)
        with open(wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
            self.next_save_steps = (
                self.global_t + flags.save_interval_step
            ) // flags.save_interval_step * flags.save_interval_step
    else:
        print("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0
        self.next_save_steps = flags.save_interval_step

    # run training threads
    self.train_threads = []
    for i in range(flags.parallel_size):
        self.train_threads.append(
            threading.Thread(target=self.train_function, args=(i, True)))

    # signal.signal(signal.SIGINT, self.signal_handler)

    # set start time
    self.start_time = time.time() - self.wall_t

    for t in self.train_threads:
        t.start()
def aa_train_main(args):
    ckpt_dir = args.ckptdir
    ckpt_prefix = args.ckptprefix
    device = args.device
    pyosr.init()
    dpy = pyosr.create_display()
    glctx = pyosr.create_gl_context(dpy)
    g = tf.Graph()
    util.mkdir_p(ckpt_dir)
    with g.as_default():
        learning_rate_input = tf.placeholder(tf.float32)
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)
        masterdriver = rldriver.RLDriver(MODELS,
                                         init_state,
                                         view_config,
                                         config.SV_VISCFG,
                                         config.MV_VISCFG,
                                         output_number=AA_OUTPUT_NUMBER,
                                         use_rgb=True,
                                         continuous_policy_loss=True)
        driver = rldriver.RLDriver(MODELS,
                                   init_state,
                                   view_config,
                                   config.SV_VISCFG,
                                   config.MV_VISCFG,
                                   output_number=AA_OUTPUT_NUMBER,
                                   use_rgb=True,
                                   master_driver=masterdriver,
                                   grads_applier=grad_applier,
                                   continuous_policy_loss=True)
        driver.get_sync_from_master_op()
        driver.get_apply_grads_op()
        driver.learning_rate_input = learning_rate_input
        driver.a3c_local_t = 32
        global_step = tf.contrib.framework.get_or_create_global_step()
        increment_global_step = tf.assign_add(global_step, 1,
                                              name='increment_global_step')
        saver = tf.train.Saver(masterdriver.get_nn_args() + [global_step])
        last_time = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir=ckpt_dir)
            print('ckpt {}'.format(ckpt))
            epoch = 0
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                epoch = sess.run(global_step)
                print('Restored!, global_step {}'.format(epoch))
            while epoch < args.iter:
                fn = "{}/{}{:06d}.npz".format(args.path, args.prefix,
                                              epoch % args.gtnumber)
                dic = np.load(fn)
                driver.train_from_gt(sess, dic['KEYS'], dic['TR'], dic['ROT'],
                                     dic['DIST'])
                epoch += 1
                sess.run(increment_global_step)
                if epoch % 1000 == 0 or time.time() - last_time >= 10 * 60:
                    print("Saving checkpoint")
                    fn = saver.save(sess, ckpt_dir + ckpt_prefix,
                                    global_step=global_step)
                    print("Saved checkpoint to {}".format(fn))
                    last_time = time.time()
                print("Epoch {}".format(epoch))
        values.append(value)

    r = random.random() * sum
    for i in range(len(values)):
        if values[i] >= r:
            return i
    # fail safe
    return len(values) - 1


global_network = GameACNetwork(ACTION_SIZE)

learning_rate_input = tf.placeholder("float")

policy_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                decay=0.99,
                                momentum=0.0,
                                epsilon=RMSP_EPSILON)
value_applier = RMSPropApplier(learning_rate=learning_rate_input,
                               decay=0.99,
                               momentum=0.0,
                               epsilon=RMSP_EPSILON)

training_threads = []
for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, 1.0,
                                        learning_rate_input,
                                        policy_applier, value_applier,
                                        8000000)
    training_threads.append(training_thread)

sess = tf.Session()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    ''' '''
    ##################
    # shared policy  #
    ##################
    tic = time.clock()
    manarger = MPManager()
    manarger.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./vedio', env_name, now + "-Master")
    # env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS

    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./vedio', env_name, now + "-" + str(i))
        # env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])

        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i, shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)

        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    ## start sess
    sess.run(init)

    ## init shared scaler and policy
    run_policy(sess, shared_env, shared_policy, shared_scaler, shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """ training loop

        Args:
            env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
            num_episodes: maximum number of episodes to run
            gamma: reward discount factor (float)
            lam: lambda from Generalized Advantage Estimate
            kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
            batch_size: number of episodes per policy training batch
            hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
            policy_logvar: natural log of initial policy variance
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        # obs_dim = obs_dim_a[thread_idx]
        # act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        # run_policy(sess, env, policy, scaler, logger, episodes=5)
        # policy.sync(shared_policy)
        # val_func.sync(shared_val_func)

        episode = 0
        while episode < num_episodes:
            ## copy global vars into local
            sess.run(policy.sync)
            sess.run(val_func.sync)

            ## compute new model on local policy
            trajectories = run_policy(sess, env, policy, scaler, logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                            episode, time.clock() - tic)

            policy.update(sess, observes, actions, advantages, logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew, logger)  # update value function

            # cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }
            sess.run(policy.apply_gradients, feed_dict)
            shared_policy.update(sess, observes, actions, advantages, shared_logger)

            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }
            sess.run(val_func.apply_gradients, feed_dict)
            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.clock() - tic})
            logger.write(display=True)  # write logger results to file and stdout

        logger.close()
    ## end of single_work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i, )))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()
    # path = os.path.join('log-files', env_name, now+'-Master', 'checkpoint')
    # saver.save(sess, path)
    sess.close()
def train():
    # initial learning rate
    initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                        INITIAL_ALPHA_LOG_RATE)

    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        # gpu_assignment = FLAGS.task_index % NUM_GPUS
        # print("Assigning worker #%d to GPU #%d" % (FLAGS.task_index, gpu_assignment))
        # device = tf.train.replica_device_setter(
        #     worker_device="/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu_assignment),
        #     cluster=cluster)
        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)

        learning_rate_input = tf.placeholder("float")
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)

        tf.set_random_seed(1)

        # There is no global network
        # lock = multiprocessing.Lock()
        # wrapper = ToDiscrete('constant-7')
        # env = wrapper(gym.make('gym_doom/DoomBasic-v0'))
        # env.close()
        training_thread = A3CTrainingThread(0, "", 0, initial_learning_rate,
                                            learning_rate_input, grad_applier,
                                            MAX_TIME_STEP, device=device,
                                            FLAGS=FLAGS,
                                            task_index=FLAGS.task_index)

        # prepare session
        with tf.device(device):
            # flag for task
            flag = tf.get_variable('flag', [],
                                   initializer=tf.constant_initializer(0),
                                   trainable=False)
            flag_ph = tf.placeholder(flag.dtype, shape=flag.get_shape())
            flag_ops = flag.assign(flag_ph)
            # global step
            global_step = tf.get_variable('global_step', [],
                                          initializer=tf.constant_initializer(0),
                                          trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype,
                                            shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)
            # score for tensorboard and score_set for genetic algorithm
            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)
            score_set = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            score_set_ph = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            score_set_ops = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            for i in range(FLAGS.worker_hosts_num):
                score_set[i] = tf.get_variable('score' + str(i), [],
                                               initializer=tf.constant_initializer(-1000),
                                               trainable=False)
                score_set_ph[i] = tf.placeholder(score_set[i].dtype,
                                                 shape=score_set[i].get_shape())
                score_set_ops[i] = score_set[i].assign(score_set_ph[i])
            # fixed path of earlier task
            fixed_path_tf = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            fixed_path_ph = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            fixed_path_ops = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            for i in range(FLAGS.L):
                for j in range(FLAGS.M):
                    fixed_path_tf[i, j] = tf.get_variable(
                        'fixed_path' + str(i) + "-" + str(j), [],
                        initializer=tf.constant_initializer(0), trainable=False)
                    fixed_path_ph[i, j] = tf.placeholder(
                        fixed_path_tf[i, j].dtype,
                        shape=fixed_path_tf[i, j].get_shape())
                    fixed_path_ops[i, j] = fixed_path_tf[i, j].assign(fixed_path_ph[i, j])
            # parameters on PathNet
            vars_ = training_thread.local_network.get_vars()
            vars_ph = np.zeros(len(vars_), dtype=object)
            vars_ops = np.zeros(len(vars_), dtype=object)
            for i in range(len(vars_)):
                vars_ph[i] = tf.placeholder(vars_[i].dtype, shape=vars_[i].get_shape())
                vars_ops[i] = vars_[i].assign(vars_ph[i])
            # initialization
            init_op = tf.global_variables_initializer()
            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=FLAGS.log_dir,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)

        try:
            os.mkdir("./data/graphs")
        except:
            pass

        # config = tf.ConfigProto(
        #     device_count = {'GPU': 0}
        # )
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = 0.1

        with sv.managed_session(server.target) as sess:
            if (FLAGS.task_index != (FLAGS.worker_hosts_num - 1)):
                for task in range(2):
                    training_thread.set_training_stage(task)

                    while sess.run([flag])[0] != (task + 1):
                        time.sleep(2)

                    # Set fixed_path
                    fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                    for i in range(FLAGS.L):
                        for j in range(FLAGS.M):
                            if (sess.run([fixed_path_tf[i, j]])[0] == 1):
                                fixed_path[i, j] = 1.0
                    training_thread.local_network.set_fixed_path(fixed_path)

                    # set start_time
                    wall_t = 0.0
                    start_time = time.time() - wall_t
                    training_thread.set_start_time(start_time)

                    while True:
                        if sess.run([global_step])[0] > (MAX_TIME_STEP * (task + 1)):
                            break
                        diff_global_t = training_thread.process(
                            sess, sess.run([global_step])[0], "", summary_op, "",
                            score_ph, score_ops, "", FLAGS,
                            score_set_ph[FLAGS.task_index],
                            score_set_ops[FLAGS.task_index])
                        sess.run(global_step_ops,
                                 {global_step_ph: sess.run([global_step])[0] + diff_global_t})
            else:
                fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                vars_backup = np.zeros(len(vars_), dtype=object)
                vars_backup = sess.run(vars_)
                winner_idx = 0

                vis = visualize.GraphVisualize([FLAGS.M] * FLAGS.L, True)

                for task in range(2):
                    # Generate geopaths randomly
                    geopath_set = np.zeros(FLAGS.worker_hosts_num - 1, dtype=object)
                    for i in range(FLAGS.worker_hosts_num - 1):
                        geopath_set[i] = pathnet.get_geopath(FLAGS.L, FLAGS.M, FLAGS.N)
                        tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                        for j in range(FLAGS.L):
                            for k in range(FLAGS.M):
                                if ((geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0)):
                                    tmp[j, k] = 1.0
                        pathnet.geopath_insert(
                            sess,
                            training_thread.local_network.geopath_update_placeholders_set[i],
                            training_thread.local_network.geopath_update_ops_set[i],
                            tmp, FLAGS.L, FLAGS.M)
                    print("Geopath Setting Done")
                    sess.run(flag_ops, {flag_ph: (task + 1)})
                    print("=============Task " + str(task + 1) + "============")

                    score_subset = np.zeros(FLAGS.B, dtype=float)
                    score_set_print = np.zeros(FLAGS.worker_hosts_num, dtype=float)
                    rand_idx = np.arange(FLAGS.worker_hosts_num - 1)
                    np.random.shuffle(rand_idx)
                    rand_idx = rand_idx[:FLAGS.B]

                    while sess.run([global_step])[0] <= (MAX_TIME_STEP * (task + 1)):
                        # if (sess.run([global_step])[0]) % 1000 == 0:
                        #     print("Saving summary...")
                        #     tf.logging.info('Running Summary operation on the chief.')
                        #     summary_str = sess.run(summary_op)
                        #     sv.summary_computed(sess, summary_str)
                        #     tf.logging.info('Finished running Summary operation.')
                        #
                        #     # Determine the next time for running the summary.
                        decodePath = lambda p: [np.where(l == 1.0)[0] for l in p]
                        flag_sum = 0
                        for i in range(FLAGS.worker_hosts_num - 1):
                            score_set_print[i] = sess.run([score_set[i]])[0]
                        for i in range(len(rand_idx)):
                            score_subset[i] = sess.run([score_set[rand_idx[i]]])[0]
                            if (score_subset[i] == -1000):
                                flag_sum = 1
                                break
                        if (flag_sum == 0):
                            vispaths = [np.array(decodePath(p)) for p in geopath_set]
                            vis.show(vispaths, 'm')

                            winner_idx = rand_idx[np.argmax(score_subset)]
                            print(str(sess.run([global_step])[0]) + " Step Score: " +
                                  str(sess.run([score_set[winner_idx]])[0]))
                            for i in rand_idx:
                                if (i != winner_idx):
                                    geopath_set[i] = np.copy(geopath_set[winner_idx])
                                    geopath_set[i] = pathnet.mutation(
                                        geopath_set[i], FLAGS.L, FLAGS.M, FLAGS.N)
                                    tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                                    for j in range(FLAGS.L):
                                        for k in range(FLAGS.M):
                                            if ((geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0)):
                                                tmp[j, k] = 1.0
                                    pathnet.geopath_insert(
                                        sess,
                                        training_thread.local_network.geopath_update_placeholders_set[i],
                                        training_thread.local_network.geopath_update_ops_set[i],
                                        tmp, FLAGS.L, FLAGS.M)
                                    sess.run(score_set_ops[i], {score_set_ph[i]: -1000})
                            rand_idx = np.arange(FLAGS.worker_hosts_num - 1)
                            np.random.shuffle(rand_idx)
                            rand_idx = rand_idx[:FLAGS.B]
                        else:
                            time.sleep(2)

                    # fixed_path setting
                    fixed_path = geopath_set[winner_idx]
                    vis.set_fixed(decodePath(fixed_path), 'r' if task == 0 else 'g')
                    vis.show(vispaths, 'm')
                    print('fix')
                    for i in range(FLAGS.L):
                        for j in range(FLAGS.M):
                            if (fixed_path[i, j] == 1.0):
                                sess.run(fixed_path_ops[i, j], {fixed_path_ph[i, j]: 1})
                    training_thread.local_network.set_fixed_path(fixed_path)
                    # backup fixed vars
                    # FIXED_VARS_BACKUP = training_thread.local_network.get_fixed_vars()
                    # FIXED_VARS_IDX_BACKUP = training_thread.local_network.get_fixed_vars_idx()
                    # initialization of parameters except fixed_path
                    vars_idx = training_thread.local_network.get_vars_idx()
                    for i in range(len(vars_idx)):
                        if (vars_idx[i] == 1.0):
                            sess.run(vars_ops[i], {vars_ph[i]: vars_backup[i]})

                vis.waitForButtonPress()

        sv.stop()
def train():
    # initial learning rate
    initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                        INITIAL_ALPHA_LOG_RATE)

    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)

        learning_rate_input = tf.placeholder("float")
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)

        tf.set_random_seed(1)

        # There is no global network
        training_thread = A3CTrainingThread(0, "", initial_learning_rate,
                                            learning_rate_input, grad_applier,
                                            MAX_TIME_STEP, device=device,
                                            FLAGS=FLAGS,
                                            task_index=FLAGS.task_index)

        # prepare session
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):
            # flag for task
            flag = tf.get_variable('flag', [],
                                   initializer=tf.constant_initializer(0),
                                   trainable=False)
            flag_ph = tf.placeholder(flag.dtype, shape=flag.get_shape())
            flag_ops = flag.assign(flag_ph)
            # global step
            global_step = tf.get_variable('global_step', [],
                                          initializer=tf.constant_initializer(0),
                                          trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype,
                                            shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)
            # score for tensorboard and score_set for genetic algorithm
            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)
            score_set = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            score_set_ph = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            score_set_ops = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            for i in range(FLAGS.worker_hosts_num):
                score_set[i] = tf.get_variable('score' + str(i), [],
                                               initializer=tf.constant_initializer(-1000),
                                               trainable=False)
                score_set_ph[i] = tf.placeholder(score_set[i].dtype,
                                                 shape=score_set[i].get_shape())
                score_set_ops[i] = score_set[i].assign(score_set_ph[i])
            # fixed path of earlier task
            fixed_path_tf = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            fixed_path_ph = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            fixed_path_ops = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            for i in range(FLAGS.L):
                for j in range(FLAGS.M):
                    fixed_path_tf[i, j] = tf.get_variable(
                        'fixed_path' + str(i) + "-" + str(j), [],
                        initializer=tf.constant_initializer(0), trainable=False)
                    fixed_path_ph[i, j] = tf.placeholder(
                        fixed_path_tf[i, j].dtype,
                        shape=fixed_path_tf[i, j].get_shape())
                    fixed_path_ops[i, j] = fixed_path_tf[i, j].assign(fixed_path_ph[i, j])
            # parameters on PathNet
            vars_ = training_thread.local_network.get_vars()
            vars_ph = np.zeros(len(vars_), dtype=object)
            vars_ops = np.zeros(len(vars_), dtype=object)
            for i in range(len(vars_)):
                vars_ph[i] = tf.placeholder(vars_[i].dtype, shape=vars_[i].get_shape())
                vars_ops[i] = vars_[i].assign(vars_ph[i])
            # initialization
            init_op = tf.global_variables_initializer()
            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=FLAGS.log_dir,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)

        with sv.managed_session(server.target) as sess:
            if (FLAGS.task_index != (FLAGS.worker_hosts_num - 1)):
                for task in range(2):
                    while True:
                        if (sess.run([flag])[0] == (task + 1)):
                            break
                        time.sleep(2)
                    # Set fixed_path
                    fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                    for i in range(FLAGS.L):
                        for j in range(FLAGS.M):
                            if (sess.run([fixed_path_tf[i, j]])[0] == 1):
                                fixed_path[i, j] = 1.0
                    training_thread.local_network.set_fixed_path(fixed_path)
                    # set start_time
                    wall_t = 0.0
                    start_time = time.time() - wall_t
                    training_thread.set_start_time(start_time)
                    while True:
                        if sess.run([global_step])[0] > (MAX_TIME_STEP * (task + 1)):
                            break
                        diff_global_t = training_thread.process(
                            sess, sess.run([global_step])[0], "", summary_op, "",
                            score_ph, score_ops, "", FLAGS,
                            score_set_ph[FLAGS.task_index],
                            score_set_ops[FLAGS.task_index])
                        sess.run(global_step_ops,
                                 {global_step_ph: sess.run([global_step])[0] + diff_global_t})
            else:
                fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                vars_backup = np.zeros(len(vars_), dtype=object)
                vars_backup = sess.run(vars_)
                winner_idx = 0
                for task in range(2):
                    # Generate geopaths randomly
                    geopath_set = np.zeros(FLAGS.worker_hosts_num - 1, dtype=object)
                    for i in range(FLAGS.worker_hosts_num - 1):
                        geopath_set[i] = pathnet.get_geopath(FLAGS.L, FLAGS.M, FLAGS.N)
                        tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                        for j in range(FLAGS.L):
                            for k in range(FLAGS.M):
                                if ((geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0)):
                                    tmp[j, k] = 1.0
                        pathnet.geopath_insert(
                            sess,
                            training_thread.local_network.geopath_update_placeholders_set[i],
                            training_thread.local_network.geopath_update_ops_set[i],
                            tmp, FLAGS.L, FLAGS.M)
                    print("Geopath Setting Done")
                    sess.run(flag_ops, {flag_ph: (task + 1)})
                    print("=============Task" + str(task + 1) + "============")
                    score_subset = np.zeros(FLAGS.B, dtype=float)
                    score_set_print = np.zeros(FLAGS.worker_hosts_num, dtype=float)
                    rand_idx = range(FLAGS.worker_hosts_num - 1)
                    np.random.shuffle(rand_idx)
                    rand_idx = rand_idx[:FLAGS.B]
                    while True:
                        if sess.run([global_step])[0] > (MAX_TIME_STEP * (task + 1)):
                            break
                        flag_sum = 0
                        for i in range(FLAGS.worker_hosts_num - 1):
                            score_set_print[i] = sess.run([score_set[i]])[0]
                        print(score_set_print)
                        for i in range(len(rand_idx)):
                            score_subset[i] = sess.run([score_set[rand_idx[i]]])[0]
                            if (score_subset[i] == -1000):
                                flag_sum = 1
                                break
                        if (flag_sum == 0):
                            winner_idx = rand_idx[np.argmax(score_subset)]
                            print(str(sess.run([global_step])[0]) + " Step Score: " +
                                  str(sess.run([score_set[winner_idx]])[0]))
                            for i in rand_idx:
                                if (i != winner_idx):
                                    geopath_set[i] = np.copy(geopath_set[winner_idx])
                                    geopath_set[i] = pathnet.mutation(
                                        geopath_set[i], FLAGS.L, FLAGS.M, FLAGS.N)
                                    tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                                    for j in range(FLAGS.L):
                                        for k in range(FLAGS.M):
                                            if ((geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0)):
                                                tmp[j, k] = 1.0
                                    pathnet.geopath_insert(
                                        sess,
                                        training_thread.local_network.geopath_update_placeholders_set[i],
                                        training_thread.local_network.geopath_update_ops_set[i],
                                        tmp, FLAGS.L, FLAGS.M)
                                    sess.run(score_set_ops[i], {score_set_ph[i]: -1000})
                            rand_idx = range(FLAGS.worker_hosts_num - 1)
                            np.random.shuffle(rand_idx)
                            rand_idx = rand_idx[:FLAGS.B]
                        else:
                            time.sleep(5)
                    # fixed_path setting
                    fixed_path = geopath_set[winner_idx]
                    for i in range(FLAGS.L):
                        for j in range(FLAGS.M):
                            if (fixed_path[i, j] == 1.0):
                                sess.run(fixed_path_ops[i, j], {fixed_path_ph[i, j]: 1})
                    training_thread.local_network.set_fixed_path(fixed_path)
                    # initialization of parameters except fixed_path
                    vars_idx = training_thread.local_network.get_vars_idx()
                    for i in range(len(vars_idx)):
                        if (vars_idx[i] == 1.0):
                            sess.run(vars_ops[i], {vars_ph[i]: vars_backup[i]})

        sv.stop()
        print("Done")