def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step):

    self.thread_index = thread_index
    self.learning_rate_input = tf.placeholder("float")
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize( self.local_network.policy_loss,
                                          self.local_network.get_policy_vars() )
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()
  
    self.policy_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                         decay = 0.99,
                                         momentum = 0.0,
                                         epsilon = RMSP_EPSILON )
    self.policy_apply_gradients = self.policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list() )

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize( self.local_network.value_loss,
                                         self.local_network.get_value_vars() )
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()
  
    self.value_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                        decay = 0.99,
                                        momentum = 0.0,
                                        epsilon = RMSP_EPSILON )
    self.value_apply_gradients = self.value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list() )
    
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
      self.score_input = tf.placeholder(tf.int32)
      tf.scalar_summary("score", self.score_input)
def show_torus_ring():
    pyosr.init()
    dpy = pyosr.create_display()
    glctx = pyosr.create_gl_context(dpy)
    g = tf.Graph()
    util.mkdir_p(ckpt_dir)
    with g.as_default():
        learning_rate_input = tf.placeholder(tf.float32)
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)
        masterdriver = rldriver.RLDriver(MODELS,
                init_state,
                view_config,
                config.SV_VISCFG,
                config.MV_VISCFG,
                use_rgb=True)
        global_step = tf.contrib.framework.get_or_create_global_step()
        increment_global_step = tf.assign_add(global_step, 1, name='increment_global_step')
        saver = tf.train.Saver(masterdriver.get_nn_args() + [global_step])
        last_time = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir=ckpt_dir)
            print('ckpt {}'.format(ckpt))
            epoch = 0
            policy_before, value_before, _, _ = masterdriver.evaluate(sess)
            #print("Last b before {}".format(sess.run(masterdriver.get_nn_args()[-2])))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                epoch = sess.run(global_step)
                print('Restored!, global_step {}'.format(epoch))
            else:
                print('Cannot find checkpoint at {}'.format(ckpt_dir))
                return
            policy_after, value_after, _, _ = masterdriver.evaluate(sess)
            print("Value Before Restoring {} and After {}".format(value_before, value_after))
            # print("Last b {}".format(sess.run(masterdriver.get_nn_args()[-2])))
            driver = masterdriver
            r = masterdriver.renderer
            fig = plt.figure()
            class ReAnimator(object):
                reaching_terminal = False
                driver = None
                im = None
                sess = None

                def __init__(self, driver, sess):
                    self.driver = driver
                    self.sess = sess

                def perform(self, framedata):
                    driver = self.driver
                    r = driver.renderer
                    sess = self.sess
                    if not self.reaching_terminal:
                        policy, value, img, dep = driver.evaluate(sess)
                        policy = policy.reshape(driver.action_size)
                        action = driver.make_decision(policy, sess)
                        nstate,reward,self.reaching_terminal = driver.get_reward(action)
                        valid = r.is_valid_state(nstate)
                        print('Current Value {} Policy {} Action {} Reward {}'.format(value, policy, action, reward))
                        print('\tNew State {} Collision Free ? {}'.format(nstate, valid))
                        # print('Action {}, New State {}'.format(action, nstate))
                        rgb = np.squeeze(img[0, 0, :, :, :], axis=[0, 1])
                        if self.im is None:
                            print('rgb {}'.format(rgb.shape))
                            self.im = plt.imshow(rgb)
                        else:
                            self.im.set_array(rgb)
                        r.state = nstate
            ra = ReAnimator(driver, sess)
            ani = animation.FuncAnimation(fig, ra.perform)
            plt.show()
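# Hedged usage note, not from the original source: show_torus_ring() assumes
# module-level names such as MODELS, init_state, view_config, config, ckpt_dir and
# device are defined elsewhere in the project, so a typical entry point would be
# roughly:
#
#     if __name__ == '__main__':
#         show_torus_ring()
#
# Keeping the FuncAnimation object (`ani` above) referenced until plt.show()
# returns is required, otherwise matplotlib may garbage-collect the animation.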
Example #3
stop_requested = False

if USE_LSTM:
  global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device)
else:
  global_network = GameACFFNetwork(ACTION_SIZE, -1, device)


training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = RMSP_ALPHA,
                              momentum = 0.0,
                              epsilon = RMSP_EPSILON,
                              clip_norm = GRAD_NORM_CLIP,
                              device = device)

for i in range(PARALLEL_SIZE):
  training_thread = A3CTrainingThread(i, global_network, initial_learning_rate,
                                      learning_rate_input,
                                      grad_applier, MAX_TIME_STEP,
                                      device = device)
  training_threads.append(training_thread)

# prepare session
config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
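# The snippet above stops right after the session is created. Below is a minimal,
# hedged sketch of the usual continuation (the same thread-start pattern appears in
# the UnrealModel example later in this file); `train_function`, the shared
# `global_t` counter and the process() call arguments are assumptions, not part of
# the original snippet.
import threading

sess.run(tf.global_variables_initializer())

global_t = 0

def train_function(parallel_index):
    # Per-worker loop: keep calling process() until the global step budget is used
    # up. The summary_writer / summary_op plumbing of the real script is omitted
    # here, and process() is assumed to accept (sess, global_t, summary_writer,
    # summary_op) as in the A3CTrainingThread class shown later in this file.
    global global_t
    training_thread = training_threads[parallel_index]
    while global_t < MAX_TIME_STEP:
        diff_global_t = training_thread.process(sess, global_t, None, None)
        global_t += diff_global_t

train_threads = [threading.Thread(target=train_function, args=(i,))
                 for i in range(PARALLEL_SIZE)]
for t in train_threads:
    t.start()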
Example #4
    device = "/cpu:0"

    initial_learning_rates = log_uniform(settings.INITIAL_ALPHA_LOW,
                                         settings.INITIAL_ALPHA_HIGH,
                                         settings.INITIAL_ALPHA_LOG_RATE)
    stop_requested = False

    print("Creating the global network...")
    global_network = Network(0, device)

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=settings.RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=settings.RMSP_EPSILON,
                                  clip_norm=settings.MAX_GRADIENT_NORM,
                                  device=device)
    print("Global network created !")

    # Create and initialize the workers
    workers = []
    for i in range(settings.NB_THREADS):
        print("\nCreating worker %i..." % (i + 1))
        worker = Agent(i + 1, global_network, initial_learning_rates,
                       learning_rate_input, grad_applier, device)

        workers.append(worker)
    print("\nEvery worker has been created !")

    # prepare session
Example #5
    global_t = 0

    stop_requested = False

    if settings.agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(settings.action_size, -1, device)
    else:
        global_network = GameACFFNetwork(settings.action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=settings.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=settings.rmsp_epsilon,
                                  clip_norm=settings.grad_norm_clip,
                                  device=device)

    for i in range(settings.parallel_agent_size):
        training_thread = A3CTrainingThread(
            i, global_network, initial_learning_rates[i], learning_rate_input,
            grad_applier, settings.max_time_step, device, settings.action_size,
            settings.gamma, settings.local_t_max, settings.entropy_beta,
            settings.agent_type, settings.performance_log_interval,
            settings.log_level, settings.random_seed)

        training_threads.append(training_thread)

    # prepare session
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
Example #6
def train():
    #initial learning rate
    pinitial_learning_rate = log_uniform(PINITIAL_ALPHA_LOW,
                                         PINITIAL_ALPHA_HIGH,
                                         INITIAL_ALPHA_LOG_RATE)
    vinitial_learning_rate = log_uniform(VINITIAL_ALPHA_LOW,
                                         VINITIAL_ALPHA_HIGH,
                                         INITIAL_ALPHA_LOG_RATE)

    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)

        plearning_rate_input = tf.placeholder("float")
        vlearning_rate_input = tf.placeholder("float")

        pgrad_applier = RMSPropApplier(learning_rate=plearning_rate_input,
                                       decay=RMSP_ALPHA,
                                       momentum=0.0,
                                       epsilon=RMSP_EPSILON,
                                       clip_norm=GRAD_NORM_CLIP,
                                       device=device)
        vgrad_applier = RMSPropApplier(learning_rate=vlearning_rate_input,
                                       decay=RMSP_ALPHA,
                                       momentum=0.0,
                                       epsilon=RMSP_EPSILON,
                                       clip_norm=GRAD_NORM_CLIP,
                                       device=device)

        tf.set_random_seed(1)
        # There is no global network
        training_thread = A3CTrainingThread(0,
                                            "",
                                            pinitial_learning_rate,
                                            plearning_rate_input,
                                            pgrad_applier,
                                            vinitial_learning_rate,
                                            vlearning_rate_input,
                                            vgrad_applier,
                                            MAX_TIME_STEP,
                                            device=device,
                                            task_index=FLAGS.task_index)

        # prepare session
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            global_step = tf.get_variable(
                'global_step', [],
                initializer=tf.constant_initializer(0),
                trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype,
                                            shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)
            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)
            init_op = tf.global_variables_initializer()
            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=LOG_FILE,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)

        with sv.managed_session(server.target) as sess:
            # set start_time
            wall_t = 0.0
            start_time = time.time() - wall_t
            training_thread.set_start_time(start_time)
            local_t = 0
            while True:
                if sess.run([global_step])[0] > MAX_TIME_STEP:
                    break
                diff_global_t = training_thread.process(
                    sess,
                    sess.run([global_step])[0], "", summary_op, "", score_ph,
                    score_ops)
                sess.run(global_step_ops, {
                    global_step_ph:
                    sess.run([global_step])[0] + diff_global_t
                })
                local_t += diff_global_t

        sv.stop()
        print("Done")
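# Hedged usage note, not from the original source: train() above reads the cluster
# layout from command-line flags (hostname, st_port_num, ps_hosts_num,
# worker_hosts_num, job_name, task_index). Assuming those flags are defined with
# tf.app.flags, a single-machine run with one parameter server and two workers
# would look roughly like:
#
#     python script.py --job_name=ps     --task_index=0 ...
#     python script.py --job_name=worker --task_index=0 ...
#     python script.py --job_name=worker --task_index=1 ...
#
# The ps process blocks in server.join(); each worker runs the training loop under
# tf.train.Supervisor.managed_session().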
class A3CTrainingThread(object):
  def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step):

    self.thread_index = thread_index
    self.learning_rate_input = tf.placeholder("float")
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize( self.local_network.policy_loss,
                                          self.local_network.get_policy_vars() )
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()
  
    self.policy_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                         decay = 0.99,
                                         momentum = 0.0,
                                         epsilon = RMSP_EPSILON )
    self.policy_apply_gradients = self.policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list() )

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize( self.local_network.value_loss,
                                         self.local_network.get_value_vars() )
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()
  
    self.value_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                        decay = 0.99,
                                        momentum = 0.0,
                                        epsilon = RMSP_EPSILON )
    self.value_apply_gradients = self.value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list() )
    
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
      self.score_input = tf.placeholder(tf.int32)
      tf.scalar_summary("score", self.score_input)

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate
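    # Linear annealing: lr(t) = initial_learning_rate * (T_max - t) / T_max,
    # clipped at 0. For example, with an (illustrative, not project-specified)
    # initial_learning_rate of 7e-4, the rate at t = T_max / 2 is 3.5e-4.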

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1
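    # The loop above builds the cumulative distribution of pi_values and samples an
    # index by inverse-CDF sampling, i.e. actions are drawn in proportion to the
    # policy probabilities. Assuming pi_values is (approximately) normalized, an
    # equivalent one-liner would be np.random.choice(len(pi_values), p=pi_values);
    # the explicit loop also tolerates small normalization error.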

  def _record_score(self, sess, summary_writer, summary_op, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      self.score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # Reset the accumulated gradients
    sess.run( self.policy_reset_gradients )
    sess.run( self.value_reset_gradients )

    # Copy weights from the shared network to the local network
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # Loop 5 times (LOCAL_T_MAX)
    for i in range(LOCAL_T_MAX):
      pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      value_ = self.local_network.run_value(sess, self.game_state.s_t)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print("pi=", pi_)
        print(" V=", value_)

      # Execute the game step
      self.game_state.process(action)

      # Result of the step
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      rewards.append(reward)

      self.local_t += 1

      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print("score=", self.episode_reward)

        if self.thread_index == 0:        
          self._record_score(sess, summary_writer, summary_op, self.episode_reward, global_t)
          
        self.episode_reward = 0
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    # Compute gradients and accumulate them
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      sess.run( self.policy_accum_gradients,
                feed_dict = {
                    self.local_network.s: [si],
                    self.local_network.a: [a],
                    self.local_network.td: [td] } )
      
      sess.run( self.value_accum_gradients,
                feed_dict = {
                    self.local_network.s: [si],
                    self.local_network.r: [R] } )

    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.policy_apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )
    sess.run( self.value_apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print("TIMESTEP", self.local_t)

    # Return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
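# Worked micro-example (illustrative numbers, not from the source) of the return
# computation in process() above: with GAMMA = 0.99, a bootstrap value R = 1.0 and
# collected rewards [0, 0, 1] (iterated in reverse), the targets are
#   R_3 = 1 + 0.99 * 1.0      = 1.99
#   R_2 = 0 + 0.99 * 1.99     = 1.9701
#   R_1 = 0 + 0.99 * 1.9701   = 1.950399
# and each policy-gradient advantage is td = R_i - V(s_i).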
Example #8
def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
              agent_type, action_size, rand_seed, checkpoint_dir):

    # use CPU for weight visualize tool
    device = "/cpu:0"

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    game = GameState(rand_seed, action_size)
    game.process(0)
    x_t = game.x_t

    plt.imshow(x_t, interpolation="nearest", cmap=plt.cm.gray)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    W_conv1 = sess.run(global_network.W_conv1)

    # show graph of W_conv1
    fig, axes = plt.subplots(4,
                             16,
                             figsize=(12, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(4 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    W_conv2 = sess.run(global_network.W_conv2)

    # show graph of W_conv2
    fig, axes = plt.subplots(2,
                             32,
                             figsize=(27, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(2 * 32)):
        inch = i // 32
        outch = i % 32
        img = W_conv2[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()

    arr = sess.run(global_network.get_vars())

    s = tf.placeholder("float", [None, 84, 84, 4])

    b_conv1 = sess.run(global_network.b_conv1)
    b_conv2 = sess.run(global_network.b_conv2)

    inp_1 = tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID")
    h_conv1 = tf.nn.relu(inp_1 + b_conv1)

    inp_2 = tf.nn.conv2d(h_conv1,
                         W_conv2,
                         strides=[1, 2, 2, 1],
                         padding="VALID")
    h_conv2 = tf.nn.relu(inp_2 + b_conv2)

    s_t = game.s_t

    getActivations(sess, s, h_conv1, s_t, 16)
    getActivations(sess, s, h_conv2, s_t, 32)
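# getActivations() is defined elsewhere in the original project. The sketch below
# is only a hedged guess at what such a helper typically does with the tensors
# passed above: run the conv layer on one stacked frame and tile its feature maps.
# The name getActivations_sketch and all of its details are assumptions.
def getActivations_sketch(sess, s, layer, stimuli, num_filters):
    # stimuli is an [84, 84, 4] frame stack; wrap it in a batch of one.
    units = sess.run(layer, feed_dict={s: [stimuli]})
    fig, axes = plt.subplots(1, num_filters, figsize=(2 * num_filters, 2),
                             subplot_kw={'xticks': [], 'yticks': []})
    for i, ax in enumerate(axes.flat):
        ax.imshow(units[0, :, :, i], cmap=plt.cm.gray, interpolation='nearest')
    plt.show()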
Example #9
if __name__ == "__main__":
  if len(sys.argv) != 2:
    print ("Usage %s <checkpoint-name>" % sys.argv[0])

  else:
    # use CPU for display tool
    device = "/cpu:0"

    global_network = MasterNetwork(Constants.NUM_ACTIONS, device)

    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                                  decay = Constants.RMSP.ALPHA,
                                  momentum = 0.0,
                                  epsilon = Constants.RMSP.EPSILON,
                                  clip_norm = Constants.RMSP.GRADIENT_NORM_CLIP,
                                  device = device)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(sys.argv[1])
    if checkpoint and checkpoint.model_checkpoint_path:
      saver.restore(sess, checkpoint.model_checkpoint_path)
      print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
      print("Could not find old checkpoint")
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients(
        )
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        #fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={self.score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # Reset the accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # Copy weights from the shared network to the local network
        sess.run(self.sync)

        start_local_t = self.local_t

        # Loop 5 times (LOCAL_T_MAX)
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print("pi=", pi_)
                print(" V=", value_)

            # Execute the game step
            self.game_state.process(action)

            # Result of the step
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score=", self.episode_reward)

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)

                self.episode_reward = 0
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # Compute gradients and accumulate them
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]
                     })

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP", self.local_t)

        # Return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example #11
File: main.py  Project: paulgowdy/doom_rl
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"

        initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                            flags.initial_alpha_high,
                                            flags.initial_alpha_log_rate)

        self.global_t = 0

        self.reward_collector = []

        self.stop_requested = False
        self.terminate_reqested = False

        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)

        self.global_network = UnrealModel(action_size, -1,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          flags.pixel_change_lambda,
                                          flags.entropy_beta, device)
        self.trainers = []

        learning_rate_input = tf.placeholder("float")

        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=flags.rmsp_alpha,
                                      momentum=0.0,
                                      epsilon=flags.rmsp_epsilon,
                                      clip_norm=flags.grad_norm_clip,
                                      device=device)

        for i in range(flags.parallel_size):
            print('building trainer', i)
            trainer = Trainer(
                i, self.global_network, initial_learning_rate,
                learning_rate_input, grad_applier, flags.env_type,
                flags.env_name, flags.use_pixel_change, flags.use_value_replay,
                flags.use_reward_prediction, flags.pixel_change_lambda,
                flags.entropy_beta, flags.local_t_max, flags.gamma,
                flags.gamma_pc, flags.experience_history_size,
                flags.max_time_step, device, self.reward_collector)
            self.trainers.append(trainer)
            print('')

        # prepare session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        # summary for tensorboard
        self.score_input = tf.placeholder(tf.int32)
        tf.summary.scalar("score", self.score_input)

        self.summary_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                    self.sess.graph)

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        # Loading script

        checkpoint = tf.train.get_checkpoint_state(flags.load_dir)

        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
            # set wall time
            wall_t_fname = flags.load_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step

        else:
            print("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        # run training threads
        self.train_threads = []
        for i in range(flags.parallel_size):
            self.train_threads.append(
                threading.Thread(target=self.train_function, args=(i, True)))

        #signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t

        for t in self.train_threads:
            t.start()
Example #12
def aa_train_main(args):
    ckpt_dir = args.ckptdir
    ckpt_prefix = args.ckptprefix
    device = args.device
    pyosr.init()
    dpy = pyosr.create_display()
    glctx = pyosr.create_gl_context(dpy)
    g = tf.Graph()
    util.mkdir_p(ckpt_dir)
    with g.as_default():
        learning_rate_input = tf.placeholder(tf.float32)
        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)
        masterdriver = rldriver.RLDriver(MODELS,
                                         init_state,
                                         view_config,
                                         config.SV_VISCFG,
                                         config.MV_VISCFG,
                                         output_number=AA_OUTPUT_NUMBER,
                                         use_rgb=True,
                                         continuous_policy_loss=True)
        driver = rldriver.RLDriver(MODELS,
                                   init_state,
                                   view_config,
                                   config.SV_VISCFG,
                                   config.MV_VISCFG,
                                   output_number=AA_OUTPUT_NUMBER,
                                   use_rgb=True,
                                   master_driver=masterdriver,
                                   grads_applier=grad_applier,
                                   continuous_policy_loss=True)
        driver.get_sync_from_master_op()
        driver.get_apply_grads_op()
        driver.learning_rate_input = learning_rate_input
        driver.a3c_local_t = 32
        global_step = tf.contrib.framework.get_or_create_global_step()
        increment_global_step = tf.assign_add(global_step,
                                              1,
                                              name='increment_global_step')
        saver = tf.train.Saver(masterdriver.get_nn_args() + [global_step])
        last_time = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir=ckpt_dir)
            print('ckpt {}'.format(ckpt))
            epoch = 0
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                epoch = sess.run(global_step)
                print('Restored!, global_step {}'.format(epoch))
            while epoch < args.iter:
                fn = "{}/{}{:06d}.npz".format(args.path, args.prefix,
                                              epoch % args.gtnumber)
                dic = np.load(fn)
                driver.train_from_gt(sess, dic['KEYS'], dic['TR'], dic['ROT'],
                                     dic['DIST'])
                epoch += 1
                sess.run(increment_global_step)
                if epoch % 1000 == 0 or time.time() - last_time >= 10 * 60:
                    print("Saving checkpoint")
                    fn = saver.save(sess,
                                    ckpt_dir + ckpt_prefix,
                                    global_step=global_step)
                    print("Saved checkpoint to {}".format(fn))
                    last_time = time.time()
                print("Epoch {}".format(epoch))
Example #13
        values.append(value)

    r = random.random() * sum
    for i in range(len(values)):
        if values[i] >= r:
            return i
    #fail safe
    return len(values) - 1


global_network = GameACNetwork(ACTION_SIZE)

learning_rate_input = tf.placeholder("float")

policy_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                decay=0.99,
                                momentum=0.0,
                                epsilon=RMSP_EPSILON)

value_applier = RMSPropApplier(learning_rate=learning_rate_input,
                               decay=0.99,
                               momentum=0.0,
                               epsilon=RMSP_EPSILON)

training_threads = []
for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, 1.0,
                                        learning_rate_input, policy_applier,
                                        value_applier, 8000000)
    training_threads.append(training_thread)

sess = tf.Session()
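# The snippet stops right after creating the session. A hedged sketch of the usual
# continuation for this older-TF code path (the same calls appear in the checkpoint
# display example earlier in this file): initialize all variables before the
# training threads start driving their process() loops.
init = tf.initialize_all_variables()
sess.run(init)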
Example #14
File: train.py  Project: wu6u3/async_ppo
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    '''
    '''
    ##################
    #  shared policy #
    ##################

    tic = time.clock()

    manager = MPManager()
    manager.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./vedio', env_name, now + "-Master")
    #env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS
    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./vedio', env_name, now + "-" + str(i))
        #env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])

        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i,
                                        shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)

        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    ## start sess
    sess.run(init)

    ## init shared scalar policy
    run_policy(sess,
               shared_env,
               shared_policy,
               shared_scaler,
               shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """ training loop

        Args:
            env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
            num_episodes: maximum number of episodes to run
            gamma: reward discount factor (float)
            lam: lambda from Generalized Advantage Estimate
            kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
            batch_size: number of episodes per policy training batch
            hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
            policy_logvar: natural log of initial policy variance
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        #obs_dim = obs_dim_a[thread_idx]
        #act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        #run_policy(sess, env, policy, scaler, logger, episodes=5)

        #policy.sync(shared_policy)
        #val_func.sync(shared_val_func)
        episode = 0

        while episode < num_episodes:

            ## copy global var into local
            sess.run(policy.sync)
            sess.run(val_func.sync)

            ## compute new model on local policy
            trajectories = run_policy(sess,
                                      env,
                                      policy,
                                      scaler,
                                      logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode,
                            time.clock() - tic)

            policy.update(sess, observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew,
                         logger)  # update value function

            #cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }

            sess.run(policy.apply_gradients, feed_dict)

            shared_policy.update(sess, observes, actions, advantages,
                                 shared_logger)

            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }

            sess.run(val_func.apply_gradients, feed_dict)

            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.clock() - tic})

            logger.write(
                display=True)  # write logger results to file and stdout

        logger.close()

    ## end def single work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i, )))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()

    #path = os.path.join('log-files', env_name, now+'-Master', 'checkpoint')
    #saver.save(sess, path )

    sess.close()
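# Hedged usage example, not from the original train.py: main() takes plain keyword
# arguments, so a direct call could look like the following. 'Hopper-v1' is the
# environment mentioned in the docstring above; the numeric values are illustrative
# only, not the project's defaults.
if __name__ == '__main__':
    main(env_name='Hopper-v1', num_episodes=1000, gamma=0.995, lam=0.98,
         kl_targ=0.003, batch_size=20, hid1_mult=10, policy_logvar=-1.0)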
def train():
    #initial learning rate
    initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
                                        INITIAL_ALPHA_HIGH,
                                        INITIAL_ALPHA_LOG_RATE)

    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)


    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        # gpu_assignment = FLAGS.task_index % NUM_GPUS
        # print("Assigning worker #%d to GPU #%d" % (FLAGS.task_index, gpu_assignment))
        # device=tf.train.replica_device_setter(
        #             worker_device="/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu_assignment),
        #             cluster=cluster);

        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)



        learning_rate_input = tf.placeholder("float")

        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=RMSP_ALPHA,
                                      momentum=0.0,
                                      epsilon=RMSP_EPSILON,
                                      clip_norm=GRAD_NORM_CLIP,
                                      device=device)

        tf.set_random_seed(1)
        # There is no global network

        #lock = multiprocessing.Lock()

        #wrapper = ToDiscrete('constant-7')
        #env = wrapper(gym.make('gym_doom/DoomBasic-v0'))
        #env.close()

        training_thread = A3CTrainingThread(0, "", 0, initial_learning_rate,
                                            learning_rate_input, grad_applier,
                                            MAX_TIME_STEP, device=device,
                                            FLAGS=FLAGS,
                                            task_index=FLAGS.task_index)

        # prepare session
        with tf.device(device):
            # flag for task
            flag = tf.get_variable('flag', [],
                                   initializer=tf.constant_initializer(0),
                                   trainable=False)
            flag_ph = tf.placeholder(flag.dtype, shape=flag.get_shape())
            flag_ops = flag.assign(flag_ph)
            # global step
            global_step = tf.get_variable('global_step', [],
                                          initializer=tf.constant_initializer(0),
                                          trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype,
                                            shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)
            # score for tensorboard and score_set for genetic algorithm
            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)
            score_set = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            score_set_ph = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            score_set_ops = np.zeros(FLAGS.worker_hosts_num, dtype=object)
            for i in range(FLAGS.worker_hosts_num):
                score_set[i] = tf.get_variable('score' + str(i), [],
                                               initializer=tf.constant_initializer(-1000),
                                               trainable=False)
                score_set_ph[i] = tf.placeholder(score_set[i].dtype,
                                                 shape=score_set[i].get_shape())
                score_set_ops[i] = score_set[i].assign(score_set_ph[i])
            # fixed path of earlier task
            fixed_path_tf = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            fixed_path_ph = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            fixed_path_ops = np.zeros((FLAGS.L, FLAGS.M), dtype=object)
            for i in range(FLAGS.L):
                for j in range(FLAGS.M):
                    fixed_path_tf[i, j] = tf.get_variable(
                        'fixed_path' + str(i) + "-" + str(j), [],
                        initializer=tf.constant_initializer(0),
                        trainable=False)
                    fixed_path_ph[i, j] = tf.placeholder(
                        fixed_path_tf[i, j].dtype,
                        shape=fixed_path_tf[i, j].get_shape())
                    fixed_path_ops[i, j] = fixed_path_tf[i, j].assign(
                        fixed_path_ph[i, j])
            # parameters on PathNet
            vars_ = training_thread.local_network.get_vars()
            vars_ph = np.zeros(len(vars_), dtype=object)
            vars_ops = np.zeros(len(vars_), dtype=object)
            for i in range(len(vars_)):
                vars_ph[i] = tf.placeholder(vars_[i].dtype,
                                            shape=vars_[i].get_shape())
                vars_ops[i] = vars_[i].assign(vars_ph[i])
            # initialization
            init_op = tf.global_variables_initializer()
            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=FLAGS.log_dir,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)
        try:
            os.mkdir("./data/graphs")
        except:
            pass

        # config = tf.ConfigProto(
        #         device_count = {'GPU': 0}
        #     )
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = 0.1

        with sv.managed_session(server.target) as sess:
            if FLAGS.task_index != (FLAGS.worker_hosts_num - 1):
                for task in range(2):
                    training_thread.set_training_stage(task)

                    while sess.run([flag])[0] != (task + 1):
                        time.sleep(2)

                    # Set fixed_path
                    fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                    for i in range(FLAGS.L):
                        for j in range(FLAGS.M):
                            if sess.run([fixed_path_tf[i, j]])[0] == 1:
                                fixed_path[i, j] = 1.0
                    training_thread.local_network.set_fixed_path(fixed_path)
                    # set start_time
                    wall_t = 0.0
                    start_time = time.time() - wall_t
                    training_thread.set_start_time(start_time)
                    while True:
                        if sess.run([global_step])[0] > (MAX_TIME_STEP * (task + 1)):
                            break
                        diff_global_t = training_thread.process(
                            sess, sess.run([global_step])[0], "", summary_op,
                            "", score_ph, score_ops, "", FLAGS,
                            score_set_ph[FLAGS.task_index],
                            score_set_ops[FLAGS.task_index])
                        sess.run(global_step_ops,
                                 {global_step_ph:
                                  sess.run([global_step])[0] + diff_global_t})
            else:
                fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                vars_backup = np.zeros(len(vars_), dtype=object)
                vars_backup = sess.run(vars_)
                winner_idx = 0

                vis = visualize.GraphVisualize([FLAGS.M] * FLAGS.L, True)

                for task in range(2):
                    # Randomly generate the candidate geopaths
                    geopath_set = np.zeros(FLAGS.worker_hosts_num - 1, dtype=object)
                    for i in range(FLAGS.worker_hosts_num - 1):
                        geopath_set[i] = pathnet.get_geopath(FLAGS.L, FLAGS.M, FLAGS.N)
                        tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float)
                        for j in range(FLAGS.L):
                            for k in range(FLAGS.M):
                                if (geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0):
                                    tmp[j, k] = 1.0
                        pathnet.geopath_insert(
                            sess,
                            training_thread.local_network.geopath_update_placeholders_set[i],
                            training_thread.local_network.geopath_update_ops_set[i],
                            tmp, FLAGS.L, FLAGS.M)
                    print("Geopath Setting Done")
                    sess.run(flag_ops, {flag_ph: (task + 1)})
                    print("=============Task " + str(task + 1) + "============")
                    score_subset = np.zeros(FLAGS.B, dtype=float)
                    score_set_print = np.zeros(FLAGS.worker_hosts_num, dtype=float)
                    rand_idx = np.arange(FLAGS.worker_hosts_num - 1)
                    np.random.shuffle(rand_idx)
                    rand_idx = rand_idx[:FLAGS.B]
                    while sess.run([global_step])[0] <= (MAX_TIME_STEP * (task + 1)):
                        # if (sess.run([global_step])[0]) % 1000 == 0:
                        #     print("Saving summary...")
                        #     tf.logging.info('Running Summary operation on the chief.')
                        #     summary_str = sess.run(summary_op)
                        #     sv.summary_computed(sess, summary_str)
                        #     tf.logging.info('Finished running Summary operation.')
                        #
                        #     # Determine the next time for running the summary.


                        decodePath = lambda p: [np.where(l==1.0)[0] for l in p]

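                        # A score of -1000 is the sentinel for "no score reported yet";
                        # skip the tournament until every sampled worker has reported.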
                        flag_sum=0;
                        for i in range(FLAGS.worker_hosts_num-1):
                            score_set_print[i]=sess.run([score_set[i]])[0];
                        for i in range(len(rand_idx)):
                            score_subset[i]=sess.run([score_set[rand_idx[i]]])[0];
                            if(score_subset[i]==-1000):
                                flag_sum=1;
                                break;
                        if(flag_sum==0):
                            vispaths = [np.array(decodePath(p)) for p in geopath_set]
                            vis.show(vispaths, 'm')

                            winner_idx=rand_idx[np.argmax(score_subset)];
                            print(str(sess.run([global_step])[0])+" Step Score: "+str(sess.run([score_set[winner_idx]])[0]));
                            for i in rand_idx:
                                if(i!=winner_idx):
                                    geopath_set[i]=np.copy(geopath_set[winner_idx]);
                                    geopath_set[i]=pathnet.mutation(geopath_set[i],FLAGS.L,FLAGS.M,FLAGS.N);
                                    tmp=np.zeros((FLAGS.L,FLAGS.M),dtype=float);
                                    for j in range(FLAGS.L):
                                        for k in range(FLAGS.M):
                                            if((geopath_set[i][j,k]==1.0)or(fixed_path[j,k]==1.0)):
                                                tmp[j,k]=1.0;
                                    pathnet.geopath_insert(sess,training_thread.local_network.geopath_update_placeholders_set[i],training_thread.local_network.geopath_update_ops_set[i],tmp,FLAGS.L,FLAGS.M);
                                sess.run(score_set_ops[i],{score_set_ph[i]:-1000})
                            rand_idx=np.arange(FLAGS.worker_hosts_num-1)
                            np.random.shuffle(rand_idx)
                            rand_idx=rand_idx[:FLAGS.B]
                        else:
                            time.sleep(2);
                    # fixed_path setting
                    fixed_path=geopath_set[winner_idx]
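                    # Freeze the winner's path: record it in the shared fixed_path variables
                    # and keep its parameters while everything else is reset below.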

                    vis.set_fixed(decodePath(fixed_path), 'r' if task == 0 else 'g')
                    vis.show(vispaths, 'm')
                    print('fix')
                    for i in range(FLAGS.L):
                        for j in range(FLAGS.M):
                            if(fixed_path[i,j]==1.0):
                                sess.run(fixed_path_ops[i,j],{fixed_path_ph[i,j]:1});
                    training_thread.local_network.set_fixed_path(fixed_path);

                    # backup fixed vars
                    # FIXED_VARS_BACKUP = training_thread.local_network.get_fixed_vars();
                    # FIXED_VARS_IDX_BACKUP = training_thread.local_network.get_fixed_vars_idx();

                    # initialization of parameters except fixed_path
                    vars_idx=training_thread.local_network.get_vars_idx();
                    for i in range(len(vars_idx)):
                        if(vars_idx[i]==1.0):
                            sess.run(vars_ops[i],{vars_ph[i]:vars_backup[i]});

                vis.waitForButtonPress()
        sv.stop();
Example #16
def train():
  #initial learning rate
  initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
                                      INITIAL_ALPHA_HIGH,
                                      INITIAL_ALPHA_LOG_RATE)

  # parameter server and worker information
  ps_hosts = np.zeros(FLAGS.ps_hosts_num,dtype=object);
  worker_hosts = np.zeros(FLAGS.worker_hosts_num,dtype=object);
  port_num=FLAGS.st_port_num;
  for i in range(FLAGS.ps_hosts_num):
    ps_hosts[i]=str(FLAGS.hostname)+":"+str(port_num);
    port_num+=1;
  for i in range(FLAGS.worker_hosts_num):
    worker_hosts[i]=str(FLAGS.hostname)+":"+str(port_num);
    port_num+=1;
  ps_hosts=list(ps_hosts);
  worker_hosts=list(worker_hosts);
  # Create a cluster from the parameter server and worker hosts.
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
  
  # Create and start a server for the local task.
  server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
  
  
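  # Parameter-server tasks only host the shared variables and block in join();
  # worker tasks build the local PathNet/A3C graph below.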
  if FLAGS.job_name == "ps":
    server.join();
  elif FLAGS.job_name == "worker":
    device=tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % FLAGS.task_index,
          cluster=cluster);
    
    learning_rate_input = tf.placeholder("float")
    
    grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                                  decay = RMSP_ALPHA,
                                  momentum = 0.0,
                                  epsilon = RMSP_EPSILON,
                                  clip_norm = GRAD_NORM_CLIP,
                                  device = device)
    
    tf.set_random_seed(1);
    # There is no global network
    training_thread = A3CTrainingThread(0, "", initial_learning_rate,
                                          learning_rate_input,
                                          grad_applier, MAX_TIME_STEP,
                                          device = device,FLAGS=FLAGS,task_index=FLAGS.task_index)
    
    # prepare session
    with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % FLAGS.task_index,
          cluster=cluster)):
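      # Shared coordination state, placed on the parameter servers by
      # replica_device_setter so every worker reads and writes the same values: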
      # flag for task
      flag = tf.get_variable('flag',[],initializer=tf.constant_initializer(0),trainable=False);
      flag_ph=tf.placeholder(flag.dtype,shape=flag.get_shape());
      flag_ops=flag.assign(flag_ph);
      # global step
      global_step = tf.get_variable('global_step',[],initializer=tf.constant_initializer(0),trainable=False);
      global_step_ph=tf.placeholder(global_step.dtype,shape=global_step.get_shape());
      global_step_ops=global_step.assign(global_step_ph);
      # score for tensorboard and score_set for genetic algorithm
      score = tf.get_variable('score',[],initializer=tf.constant_initializer(-21),trainable=False);
      score_ph=tf.placeholder(score.dtype,shape=score.get_shape());
      score_ops=score.assign(score_ph);
      score_set=np.zeros(FLAGS.worker_hosts_num,dtype=object);
      score_set_ph=np.zeros(FLAGS.worker_hosts_num,dtype=object);
      score_set_ops=np.zeros(FLAGS.worker_hosts_num,dtype=object);
      for i in range(FLAGS.worker_hosts_num):
        score_set[i] = tf.get_variable('score'+str(i),[],initializer=tf.constant_initializer(-1000),trainable=False);
        score_set_ph[i]=tf.placeholder(score_set[i].dtype,shape=score_set[i].get_shape());
        score_set_ops[i]=score_set[i].assign(score_set_ph[i]);
      # fixed path of earlier task
      fixed_path_tf=np.zeros((FLAGS.L,FLAGS.M),dtype=object);
      fixed_path_ph=np.zeros((FLAGS.L,FLAGS.M),dtype=object);
      fixed_path_ops=np.zeros((FLAGS.L,FLAGS.M),dtype=object);
      for i in range(FLAGS.L):
        for j in range(FLAGS.M):
          fixed_path_tf[i,j]=tf.get_variable('fixed_path'+str(i)+"-"+str(j),[],initializer=tf.constant_initializer(0),trainable=False);
          fixed_path_ph[i,j]=tf.placeholder(fixed_path_tf[i,j].dtype,shape=fixed_path_tf[i,j].get_shape());
          fixed_path_ops[i,j]=fixed_path_tf[i,j].assign(fixed_path_ph[i,j]);
      # parameters on PathNet
      vars_=training_thread.local_network.get_vars();
      vars_ph=np.zeros(len(vars_),dtype=object);
      vars_ops=np.zeros(len(vars_),dtype=object);
      for i in range(len(vars_)):
        vars_ph[i]=tf.placeholder(vars_[i].dtype,shape=vars_[i].get_shape());
        vars_ops[i]=vars_[i].assign(vars_ph[i]);
      
      # initialization
      init_op=tf.global_variables_initializer();
      # summary for tensorboard
      tf.summary.scalar("score", score);
      summary_op = tf.summary.merge_all()
      saver = tf.train.Saver();
    
    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                   global_step=global_step,
                                   logdir=FLAGS.log_dir,
                                   summary_op=summary_op,
                                   saver=saver,
                                   init_op=init_op)
    
    with sv.managed_session(server.target) as sess:
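      # Every worker except the last one trains on its assigned geopath; the last
      # worker acts as the genetic-algorithm controller that evolves the paths.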
      if(FLAGS.task_index!=(FLAGS.worker_hosts_num-1)):
        for task in range(2):
          # Wait until the controller signals that this task may start.
          while sess.run([flag])[0] != (task+1):
            time.sleep(2);
          # Set fixed_path
          fixed_path=np.zeros((FLAGS.L,FLAGS.M),dtype=float);
          for i in range(FLAGS.L):
            for j in range(FLAGS.M):
              if(sess.run([fixed_path_tf[i,j]])[0]==1):
                fixed_path[i,j]=1.0;
          training_thread.local_network.set_fixed_path(fixed_path);
          # set start_time
          wall_t=0.0;
          start_time = time.time() - wall_t
          training_thread.set_start_time(start_time)
          while True:
            if sess.run([global_step])[0] > (MAX_TIME_STEP*(task+1)):
              break
            diff_global_t = training_thread.process(sess, sess.run([global_step])[0], "",
                                                    summary_op, "", score_ph, score_ops, "",
                                                    FLAGS, score_set_ph[FLAGS.task_index],
                                                    score_set_ops[FLAGS.task_index])
            sess.run(global_step_ops,{global_step_ph:sess.run([global_step])[0]+diff_global_t});
      else:
        fixed_path=np.zeros((FLAGS.L,FLAGS.M),dtype=float);
        vars_backup=np.zeros(len(vars_),dtype=object);
        vars_backup=sess.run(vars_);
        winner_idx=0;
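        # Controller loop: sample random geopaths for the training workers, wait for
        # their scores, run B-way tournaments, and mutate the losers from the winner.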
        for task in range(2):
          # Generating randomly geopath
          geopath_set=np.zeros(FLAGS.worker_hosts_num-1,dtype=object);
          for i in range(FLAGS.worker_hosts_num-1):
            geopath_set[i]=pathnet.get_geopath(FLAGS.L,FLAGS.M,FLAGS.N);
            tmp=np.zeros((FLAGS.L,FLAGS.M),dtype=float);
            for j in range(FLAGS.L):
              for k in range(FLAGS.M):
                if((geopath_set[i][j,k]==1.0)or(fixed_path[j,k]==1.0)):
                  tmp[j,k]=1.0;
            pathnet.geopath_insert(sess,training_thread.local_network.geopath_update_placeholders_set[i],training_thread.local_network.geopath_update_ops_set[i],tmp,FLAGS.L,FLAGS.M);
          print("Geopath Setting Done");
          sess.run(flag_ops,{flag_ph:(task+1)});
          print("=============Task "+str(task+1)+"============");
          score_subset=np.zeros(FLAGS.B,dtype=float);
          score_set_print=np.zeros(FLAGS.worker_hosts_num,dtype=float);
          rand_idx=np.arange(FLAGS.worker_hosts_num-1); np.random.shuffle(rand_idx);
          rand_idx=rand_idx[:FLAGS.B];
          while True:
            if sess.run([global_step])[0] > (MAX_TIME_STEP*(task+1)):
              break
            flag_sum=0;
            for i in range(FLAGS.worker_hosts_num-1):
              score_set_print[i]=sess.run([score_set[i]])[0];
            print(score_set_print);
            for i in range(len(rand_idx)):
              score_subset[i]=sess.run([score_set[rand_idx[i]]])[0];
              if(score_subset[i]==-1000):
                flag_sum=1;
                break;
            if(flag_sum==0):
              winner_idx=rand_idx[np.argmax(score_subset)];
              print(str(sess.run([global_step])[0])+" Step Score: "+str(sess.run([score_set[winner_idx]])[0]));
              for i in rand_idx:
                if(i!=winner_idx):
                  geopath_set[i]=np.copy(geopath_set[winner_idx]);
                  geopath_set[i]=pathnet.mutation(geopath_set[i],FLAGS.L,FLAGS.M,FLAGS.N);
                  tmp=np.zeros((FLAGS.L,FLAGS.M),dtype=float);
                  for j in range(FLAGS.L):
                    for k in range(FLAGS.M):
                      if((geopath_set[i][j,k]==1.0)or(fixed_path[j,k]==1.0)):
                        tmp[j,k]=1.0;
                  pathnet.geopath_insert(sess,training_thread.local_network.geopath_update_placeholders_set[i],training_thread.local_network.geopath_update_ops_set[i],tmp,FLAGS.L,FLAGS.M);
                sess.run(score_set_ops[i],{score_set_ph[i]:-1000});
              rand_idx=np.arange(FLAGS.worker_hosts_num-1); np.random.shuffle(rand_idx);
              rand_idx=rand_idx[:FLAGS.B];
            else:
              time.sleep(5);
          # fixed_path setting
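          # Freeze the winning path; its parameters are kept while all other
          # parameters are reset to their initial values before the next task.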
          fixed_path=geopath_set[winner_idx];
          for i in range(FLAGS.L):
            for j in range(FLAGS.M):
              if(fixed_path[i,j]==1.0):
                sess.run(fixed_path_ops[i,j],{fixed_path_ph[i,j]:1});
          training_thread.local_network.set_fixed_path(fixed_path);
          # initialization of parameters except fixed_path
          vars_idx=training_thread.local_network.get_vars_idx();
          for i in range(len(vars_idx)):
            if(vars_idx[i]==1.0):
              sess.run(vars_ops[i],{vars_ph[i]:vars_backup[i]});
    sv.stop();
    print("Done");
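
# The controller branch above is, at its core, a tournament-selection step over
# binary L x M path matrices. Below is a minimal, self-contained NumPy sketch of
# that step (not part of the original source); random_geopath and mutate are
# hypothetical stand-ins for pathnet.get_geopath and pathnet.mutation.
import numpy as np

L, M, N, B = 3, 10, 3, 2   # layers, modules per layer, active modules per layer, tournament size

def random_geopath(L, M, N):
    # Binary L x M matrix with exactly N active modules in each layer.
    path = np.zeros((L, M))
    for l in range(L):
        path[l, np.random.choice(M, N, replace=False)] = 1.0
    return path

def mutate(path, prob=0.1):
    # With small probability, move each active module to a random slot in its layer.
    new = path.copy()
    for l in range(L):
        for m in range(M):
            if new[l, m] == 1.0 and np.random.rand() < prob:
                new[l, m] = 0.0
                new[l, np.random.randint(M)] = 1.0
    return new

population = [random_geopath(L, M, N) for _ in range(8)]
scores = np.random.randn(8)               # stand-in for the per-worker episode scores

idx = np.random.choice(len(population), B, replace=False)
winner = idx[int(np.argmax(scores[idx]))]
for i in idx:
    if i != winner:
        population[i] = mutate(population[winner])
print("winner:", winner)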