Example #1
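    # Viewer setup: opens a pygame window, builds a display-only global UnrealModel on the CPU, and creates the environment plus value/state history buffers.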
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        pygame.display.set_caption('UNREAL')

        self.action_size = Environment.get_action_size(flags.env_type,
                                                       flags.env_name)
        self.objective_size = Environment.get_objective_size(
            flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size,
                                          self.objective_size,
                                          -1,
                                          flags.use_lstm,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          0.0,
                                          0.0,
                                          "/cpu:0",
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type,
            flags.env_name,
            env_args={
                'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene
            })
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0
Example #2
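  # Worker (trainer) setup: a per-thread local UnrealModel with its loss, gradient application against the shared global network, a sync op, and an experience replay buffer.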
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               env_type,
               env_name,
               use_lstm,
               use_pixel_change,
               use_value_replay,
               use_reward_prediction,
               pixel_change_lambda,
               entropy_beta,
               local_t_max,
               gamma,
               gamma_pc,
               experience_history_size,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_lstm = use_lstm
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.objective_size = Environment.get_objective_size(env_type, env_name)
    
    self.local_network = UnrealModel(self.action_size,
                                     self.objective_size,
                                     thread_index,
                                     use_lstm,
                                     use_pixel_change,
                                     use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda,
                                     entropy_beta,
                                     device)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars())
    
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
Example #3
 def __init__(self):
     self.action_size = Environment.get_action_size(flags.env_type,
                                                    flags.env_name)
     self.objective_size = Environment.get_objective_size(
         flags.env_type, flags.env_name)
     self.global_network = UnrealModel(self.action_size,
                                       self.objective_size,
                                       -1,
                                       flags.use_lstm,
                                       flags.use_pixel_change,
                                       flags.use_value_replay,
                                       flags.use_reward_prediction,
                                       0.0,
                                       0.0,
                                       "/cpu:0",
                                       for_display=True)
     self.environment = Environment.create_environment(
         flags.env_type,
         flags.env_name,
         env_args={
             'episode_schedule': flags.split,
             'log_action_trace': flags.log_action_trace,
             'seed': flags.seed,
             # 'max_states_per_scene': flags.episodes_per_scene,
             'episodes_per_scene_test': flags.episodes_per_scene
         })
     self.episode_reward = 0
     self.cnt_success = 0
Example #4
 def __init__(self):
     self.action_size = Environment.get_action_size(flags.env_type,
                                                    flags.env_name)
     self.objective_size = Environment.get_objective_size(
         flags.env_type, flags.env_name)
     print('flags:use_pixel_change {}'.format(flags.use_pixel_change))
     sleep(10)
     self.global_network = UnrealModel(self.action_size,
                                       self.objective_size,
                                       -1,
                                       flags.use_lstm,
                                       flags.use_pixel_change,
                                       flags.use_value_replay,
                                       flags.use_reward_prediction,
                                       0.0,
                                       0.0,
                                       "/cpu:0",
                                       for_display=True)
     self.environment = Environment.create_environment(
         flags.env_type,
         flags.env_name,
         env_args={
             'episode_schedule': flags.split,
             'log_action_trace': flags.log_action_trace,
             'max_states_per_scene': flags.episodes_per_scene,
             'episodes_per_scene_test': flags.episodes_per_scene
         })
     print('\n======\nENV in Evaluate::ctor')
     print(self.environment)
     print(self.global_network)
     print('val_replay!!! {}'.format(flags.use_value_replay))
     print(flags.split)
     print('=======\n')
     sleep(10)
     self.episode_reward = 0
Example #5
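    # Trainer variant that stores a local Agent, observation/action processers, a shared optimizer ("optimizor") and torch's Categorical distribution alongside the usual experience buffer and environment.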
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 env_args, use_pixel_change, use_value_replay,
                 use_reward_prediction, pixel_change_lambda, entropy_beta,
                 local_t_max, gamma, gamma_pc, experience_history_size,
                 max_global_time_step, spatial_dim, optimizor):

        self.thread_index = thread_index
        self.env_args = env_args
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size()
        self.local_network = Agent(thread_index, use_pixel_change,
                                   use_value_replay, use_reward_prediction,
                                   pixel_change_lambda, entropy_beta)

        self.global_network = global_network
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.spatial_dim = spatial_dim
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=spatial_dim)
        self.optimizor = optimizor
        self.distribution = th.distributions.Categorical
        # For log output
        self.prev_local_t = 0
        self.environment = Environment.create_environment(self.env_args)
Example #6
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.action_size = Environment.get_action_size()
    self.local_network = UnrealModel(self.action_size, thread_index, device)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars())
    
    self.sync = self.local_network.sync_from(global_network)
    self.environment = Environment.create_environment()
    self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
Example #7
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 env_name, use_pixel_change, use_value_replay,
                 use_reward_prediction, pixel_change_lambda, entropy_beta,
                 local_t_max, gamma, gamma_pc, experience_history_size,
                 max_global_time_step, grad_norm_clip, optimizor, device):

        self.thread_index = thread_index
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_name)
        self.local_network = Agent(thread_index, self.action_size,
                                   use_pixel_change, use_value_replay,
                                   use_reward_prediction, pixel_change_lambda,
                                   entropy_beta, device)

        self.global_network = global_network
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.optimizor = optimizor
        self.distribution = torch.distributions.Categorical
        self.grad_norm_clip = grad_norm_clip
        self.prev_local_t = 0
Example #8
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        name = 'UNREAL' if flags.segnet == 0 else "A3C ErfNet"
        pygame.display.set_caption(name)

        env_config = sim_config.get(flags.env_name)
        self.image_shape = [
            env_config.get('height', 88),
            env_config.get('width', 88)
        ]
        segnet_param_dict = {'segnet_mode': flags.segnet}
        is_training = tf.placeholder(tf.bool, name="training")
        map_file = env_config.get('objecttypes_file', '../../objectTypes.csv')
        self.label_mapping = pd.read_csv(map_file, sep=',', header=0)
        self.get_col_index()

        self.action_size = Environment.get_action_size(flags.env_type,
                                                       flags.env_name)
        self.objective_size = Environment.get_objective_size(
            flags.env_type, flags.env_name)
        self.global_network = UnrealModel(self.action_size,
                                          self.objective_size,
                                          -1,
                                          flags.use_lstm,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          0.0,
                                          0.0,
                                          "/gpu:0",
                                          segnet_param_dict=segnet_param_dict,
                                          image_shape=self.image_shape,
                                          is_training=is_training,
                                          n_classes=flags.n_classes,
                                          segnet_lambda=flags.segnet_lambda,
                                          dropout=flags.dropout,
                                          for_display=True)
        self.environment = Environment.create_environment(
            flags.env_type,
            flags.env_name,
            flags.termination_time_sec,
            env_args={
                'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene
            })
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0
Example #9
 def __init__(self):
     self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
     self.action_size = Environment.get_action_size()
     self.global_network = UnrealModel(self.action_size,
                                       -1,
                                       "/cpu:0",
                                       for_display=True)
     self.env = Environment.create_environment()
     self.value_history = ValueHistory()
     self.state_history = StateHistory()
     self.ep_reward = 0
     self.mazemap = MazeMap()
Example #10
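    # Base trainer: wires a local UnrealModel to the shared global network (gradient application and sync) and keeps per-episode loss, entropy and gradient trackers for logging.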
    def __init__(self,
                 runner,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 env_type,
                 env_name,
                 entropy_beta,
                 gamma,
                 experience,
                 max_global_time_step,
                 device,
                 value_lambda):
        self.runner = runner
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.gamma = gamma
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.obs_size = Environment.get_obs_size(env_type, env_name)
        self.global_network = global_network
        self.local_network = UnrealModel(self.action_size,
                                         self.obs_size,
                                         1,
                                         entropy_beta,
                                         device,
                                         value_lambda=value_lambda)

        self.local_network.prepare_loss()
        
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss,
            self.global_network.get_vars(),
            self.local_network.get_vars())
        self.sync = self.local_network.sync_from(self.global_network, name="base_trainer")
        self.experience = experience
        self.local_t = 0
        self.next_log_t = 0
        self.next_performance_t = PERFORMANCE_LOG_INTERVAL
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # trackers for the experience replay creation
        self.last_state = None
        self.last_action = 0
        self.last_reward = 0
        self.ep_ploss = 0.
        self.ep_vloss = 0.
        self.ep_entr = []
        self.ep_grad = []
        self.ep_l = 0
Example #11
 def __init__(self, args, display_size, saver):
     pygame.init()
     self.args = args
     self.surface = pygame.display.set_mode(display_size, 0, 24)
     pygame.display.set_caption('UNREAL')
     args.action_size = Environment.get_action_size(args.env_name)
     self.global_network = Agent(1, args)
     saver.restore(self.global_network)
     self.global_network.eval()
     self.environment = Environment.create_environment(args.env_name)
     self.font = pygame.font.SysFont(None, 20)
     self.value_history = ValueHistory()
     self.state_history = StateHistory()
     self.distribution = torch.distributions.Categorical
     self.episode_reward = 0
Example #12
    def test_step(self):
        environment = Environment.create_environment()
        action_size = Environment.get_action_size()

        if sys.platform == 'darwin':
            self.assertTrue(action_size == 6)
        else:
            self.assertTrue(action_size == 8)

        for i in range(3):
            self.assertTrue(environment.last_observation.shape == (84, 84))
            if SAVE_IMAGE:
                scipy.misc.imsave("debug_observation{0}.png".format(i),
                                  environment.last_observation)
            reward, terminal = environment.step(0)
Example #13
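# Weight-visualization tool: restores a checkpointed global UnrealModel on the CPU and plots the filters of the first base convolution (W_base_conv1) with matplotlib.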
def main(args):
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type,
                                                    flags.env_name)
    global_network = UnrealModel(action_size, objective_size, -1,
                                 flags.use_lstm, flags.use_pixel_change,
                                 flags.use_value_replay,
                                 flags.use_reward_prediction, 0.0, 0.0,
                                 "/cpu:0")  # use CPU for weight visualize tool

    sess = tf.Session()

    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    vars = {}
    var_list = global_network.get_vars()
    for v in var_list:
        vars[v.name] = v

    W_conv1 = sess.run(vars['net_-1/base_conv/W_base_conv1:0'])

    # show graph of W_conv1
    fig, axes = plt.subplots(3,
                             16,
                             figsize=(12, 6),
                             subplot_kw={
                                 'xticks': [],
                                 'yticks': []
                             })
    fig.subplots_adjust(hspace=0.1, wspace=0.1)

    for ax, i in zip(axes.flat, range(3 * 16)):
        inch = i // 16
        outch = i % 16
        img = W_conv1[:, :, inch, outch]
        ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
        ax.set_title(str(inch) + "," + str(outch))

    plt.show()
Example #14
  def __init__(self):
    self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

    env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config['height'], env_config['width']]
    segnet_param_dict = {'segnet_mode': flags.segnet}
    is_training = tf.placeholder(tf.bool, name="training")  # its value is set by the for_display param in UnrealModel

    self.global_network = UnrealModel(self.action_size,
                                      self.objective_size,
                                      -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      0.0, #flags.pixel_change_lambda
                                      0.0, #flags.entropy_beta
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout,
                                      for_display=True)
    self.environment = Environment.create_environment(flags.env_type, flags.env_name, flags.termination_time_sec,
                                                      env_args={'episode_schedule': flags.split,
                                                                'log_action_trace': flags.log_action_trace,
                                                                'max_states_per_scene': flags.episodes_per_scene,
                                                                'episodes_per_scene_test': flags.episodes_per_scene})

    self.global_network.prepare_loss()

    self.total_loss = []
    self.segm_loss = []
    self.episode_reward = [0]
    self.episode_roomtype = []
    self.roomType_dict  = {}
    self.segnet_class_dict = {}
    self.success_rate = []
    self.batch_size = 20
    self.batch_cur_num = 0
    self.batch_prev_num = 0
    self.batch_si = []
    self.batch_sobjT = []
    self.batch_a = []
    self.batch_reward = []
Example #15
    def __init__(self, display_size):
        pygame.init()

        self.surface = pygame.display.set_mode(display_size, 0, 24)
        pygame.display.set_caption('UNREAL')

        self.action_size = Environment.get_action_size()
        self.global_network = UnrealModel(self.action_size,
                                          -1,
                                          "/cpu:0",
                                          for_display=True)
        self.environment = Environment.create_environment()
        self.font = pygame.font.SysFont(None, 20)
        self.value_history = ValueHistory()
        self.state_history = StateHistory()
        self.episode_reward = 0
Example #16
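    # Environment smoke test: steps the environment three times with action 0, printing state, reward and terminal, then stops it.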
    def check_environment(self, env_type, env_name):
        env = Environment.create_environment(env_type, env_name, 0)
        action_size = Environment.get_action_size(env_type, env_name)

        for i in range(3):
            state, reward, terminal = env.process(0)

            print(state)
            print(reward)
            print(terminal)
            # # Check shape
            # self.assertTrue(state.shape == (84, 84, 3))
            # # state and pixel_change value range should be [0,1]
            # self.assertTrue(np.amax(state) <= 1.0)

        env.stop()
Example #17
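# DQN training loop for the Flappy Bird environment: loads a saved Keras model, plays episodes while storing transitions and running experience replay, optionally logs rewards, and periodically saves the model.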
def learn_flappyb():
    env = Environment(draw=DRAW, fps=1, debug=False,
                      dist_to_pipe=DIFFICULTY_LEARN,
                      dist_between_pipes=DIST_BETWEEN_PIPES,
                      obs_this_pipe=OBS_THIS_PIPE_LEARN)
    writer = None
    if WRITE:
        writer = SummaryWriter(comment=NAME)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    model = load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)
    run = 0

    if SAVE_MODEL:
        name = '{}-PART={}'.format(NAME, run)
        dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        reward_score = 0

        while True:
            step += 1
            action = dqn_solver.act(state, env)
            state_next, reward, terminal, info = env.step_buffer(action)
            reward_score += reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(reward_score))
                if WRITE:
                    writer.add_scalar("reward", reward_score, run)
                break
            dqn_solver.experience_replay()
        if (run % 100 == 0) and SAVE_MODEL:
            name = '{}-PART={}'.format(NAME, run)
            dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    if WRITE:
        writer.close()
Example #18
  def __init__(self, display_size, model):
    pygame.init()
    self.surface = pygame.display.set_mode(display_size, 0, 24)
    pygame.display.set_caption('MAPREADER')

    self.action_size = Environment.get_action_size()
    self.global_network = model
    self.environment = Environment.create_environment(*DISPLAY_LEVEL)
    self.font = pygame.font.SysFont(None, 20)
    self.value_history = ValueHistory()
    self.step_count = 0
    self.episode_reward = 0
    self.episode_intrinsic_reward = 0
    self.state = self.environment.last_state
    self.replan = True
    self.path = []
    self.maze_size = DISPLAY_LEVEL[0]//40*2+7
Example #19
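    # Unit test: verifies observation and pixel-change shapes (84x84x3 and 20x20) and that both stay within the [0, 1] range.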
    def test_process(self):
        environment = Environment.create_environment()
        action_size = Environment.get_action_size()

        for i in range(3):
            state, reward, terminal, pixel_change = environment.process(0)

            # Check shape
            self.assertTrue(state.shape == (84, 84, 3))
            self.assertTrue(environment.last_state.shape == (84, 84, 3))
            self.assertTrue(pixel_change.shape == (20, 20))

            # state and pixel_change value range should be [0,1]
            self.assertTrue(np.amax(state) <= 1.0)
            self.assertTrue(np.amin(state) >= 0.0)
            self.assertTrue(np.amax(pixel_change) <= 1.0)
            self.assertTrue(np.amin(pixel_change) >= 0.0)
Example #20
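# Restores a checkpointed UnrealModel and runs its encoder on zeroed future-reward-prediction inputs, printing the resulting encoder output.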
def get_prediction(history, action, env_name, check_dir):
    action_size = Environment.get_action_size(env_type, env_name)
    global_network = UnrealModel(
        action_size,
        -1,
        #flags.use_pixel_change,
        #flags.use_value_replay,
        #flags.use_reward_prediction,
        #flags.use_future_reward_prediction,
        #flags.use_autoencoder,
        False,
        False,
        False,
        True,
        True,
        .0,
        .0,
        "/cpu:0")

    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(check_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    feed_dict = {
        global_network.frp_input: np.zeros((4, 84, 84, 3)),
        global_network.frp_action_input: np.zeros((1, action_size))
    }
    encoder_output = sess.run(global_network.encoder_output, feed_dict)
    print(encoder_output)
Example #21
def play_flappyb():
    env = Environment(draw=True, fps=1, debug=True,
                      dist_to_pipe=DIFFICULTY_PLAY,
                      dist_between_pipes=DIST_BETWEEN_PIPES,
                      obs_this_pipe=OBS_THIS_PIPE_PLAY)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)

    for i in range(20):
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        is_done = False
        while not is_done:
            action = dqn_solver.act_free(state)
            # action = env.get_action_random()
            state_next, reward, terminal, info = env.step_buffer(action)
            is_done = terminal
            state = np.reshape(state_next, [1, observation_space])
Example #22
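    # MapReader trainer: the usual local-network / gradient / sync setup, plus a per-thread maze size (larger mazes for lower thread indices) and a random level seed.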
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.action_size = Environment.get_action_size()
        self.local_network = MapReaderModel(self.action_size, thread_index,
                                            device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.maze_size = 5
        if self.thread_index in range(2):
            self.maze_size = 13
        elif self.thread_index in [2, 3]:
            self.maze_size = 11
        elif self.thread_index in [4, 5]:
            self.maze_size = 9
        elif self.thread_index in [6, 7]:
            self.maze_size = 7
        self.level_seed = np.random.randint(LEVEL_SET_SIZE)
        # For log output
        self.prev_local_t = 0
        self.last_terminal_local_t = 0
        self.steps_buffer = deque()
        self.correct_exits = 0
        self.running = True
Example #23
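    # Main training entry point: builds the global UnrealModel, a runner thread, a base trainer and one auxiliary trainer per worker, restores any checkpoint, then starts the training threads and waits for Ctrl+C.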
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"
        logger.debug("start App")
        initial_learning_rate = flags.initial_learning_rate

        self.global_t = 0
        self.aux_t = 0
        self.stop_requested = False
        self.terminate_requested = False
        logger.debug("getting action size and observation size...")
        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)
        obs_size = Environment.get_obs_size(flags.env_type, flags.env_name)
        # Setup Global Network
        logger.debug("loading global model...")
        self.global_network = UnrealModel(
            action_size,
            obs_size,
            -1,
            flags.entropy_beta,
            device,
            use_pixel_change=flags.use_pixel_change,
            use_value_replay=flags.use_value_replay,
            use_reward_prediction=flags.use_reward_prediction,
            use_temporal_coherence=flags.use_temporal_coherence,
            use_proportionality=flags.use_proportionality,
            use_causality=flags.use_causality,
            use_repeatability=flags.use_repeatability,
            value_lambda=flags.value_lambda,
            pixel_change_lambda=flags.pixel_change_lambda,
            temporal_coherence_lambda=flags.temporal_coherence_lambda,
            proportionality_lambda=flags.proportionality_lambda,
            causality_lambda=flags.causality_lambda,
            repeatability_lambda=flags.repeatability_lambda)
        logger.debug("done loading global model")
        learning_rate_input = tf.placeholder("float")

        # Setup gradient calculator
        #"""
        grad_applier = RMSPropApplier(
            learning_rate=learning_rate_input,
            #decay = flags.rmsp_alpha,
            momentum=0.0,
            #epsilon = flags.rmsp_epsilon,
            clip_norm=flags.grad_norm_clip,
            device=device)
        """
        grad_applier = AdamApplier(learning_rate = learning_rate_input,
                                   clip_norm=flags.grad_norm_clip,
                                   device=device)
        """
        # Start environment
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name)
        logger.debug("done loading environment")

        # Setup runner
        self.runner = RunnerThread(flags, self.environment,
                                   self.global_network, action_size, obs_size,
                                   device, visualise)
        logger.debug("done setting up RunnerTread")

        # Setup experience
        self.experience = Experience(flags.experience_history_size)

        #@TODO check device usage: should we build a cluster?
        # Setup Base Network
        self.base_trainer = BaseTrainer(
            self.runner, self.global_network, initial_learning_rate,
            learning_rate_input, grad_applier, flags.env_type, flags.env_name,
            flags.entropy_beta, flags.gamma, self.experience,
            flags.max_time_step, device, flags.value_lambda)

        # Setup Aux Networks
        self.aux_trainers = []
        for k in range(flags.parallel_size):
            self.aux_trainers.append(
                AuxTrainer(
                    self.global_network,
                    k + 2,  #-1 is global, 0 is runnerthread, 1 is base
                    flags.use_base,
                    flags.use_pixel_change,
                    flags.use_value_replay,
                    flags.use_reward_prediction,
                    flags.use_temporal_coherence,
                    flags.use_proportionality,
                    flags.use_causality,
                    flags.use_repeatability,
                    flags.value_lambda,
                    flags.pixel_change_lambda,
                    flags.temporal_coherence_lambda,
                    flags.proportionality_lambda,
                    flags.causality_lambda,
                    flags.repeatability_lambda,
                    flags.aux_initial_learning_rate,
                    learning_rate_input,
                    grad_applier,
                    self.aux_t,
                    flags.env_type,
                    flags.env_name,
                    flags.entropy_beta,
                    flags.local_t_max,
                    flags.gamma,
                    flags.aux_lambda,
                    flags.gamma_pc,
                    self.experience,
                    flags.max_time_step,
                    device))

        # Start tensorflow session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        self.init_tensorboard()

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            checkpointpath = checkpoint.model_checkpoint_path.replace(
                "/", "\\")
            logger.info("checkpoint loaded: {}".format(checkpointpath))
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            logger.info(">>> global step set: {}".format(self.global_t))
            logger.info(">>> aux step: {}".format(self.aux_t))
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step
                logger.debug("next save steps:{}".format(self.next_save_steps))
        else:
            logger.info("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t
        # Start runner
        self.runner.start_runner(self.sess)
        # Start base_network thread
        self.base_train_thread = threading.Thread(
            target=self.base_train_function, args=())
        self.base_train_thread.start()

        # Start aux_network threads
        self.aux_train_threads = []
        for k in range(flags.parallel_size):
            self.aux_train_threads.append(
                threading.Thread(target=self.aux_train_function, args=(k, )))
            self.aux_train_threads[k].start()

        logger.debug(threading.enumerate())

        logger.info('Press Ctrl+C to stop')
        signal.pause()
Example #24
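    # Distributed (ps/worker) setup: the global UnrealModel lives on the parameter server while a local copy on the worker computes gradients, which are clipped and applied with RMSProp; losses and norms are logged as TensorBoard summaries.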
    def __init__(self, env, task, visualise):
        self.env = env
        self.task = task
        self.ob_shape = [HEIGHT, WIDTH, CHANNEL]
        self.action_n = Environment.get_action_size()
        # define the network stored in ps which is used to sync
        worker_device = '/job:worker/task:{}'.format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope('global'):
                self.experience = Experience(
                    EXPERIENCE_HISTORY_SIZE)  # exp replay pool
                self.network = UnrealModel(self.action_n, self.env,
                                           self.experience)
                self.global_step = tf.get_variable('global_step',
                                                   dtype=tf.int32,
                                                   initializer=tf.constant(
                                                       0, dtype=tf.int32),
                                                   trainable=False)
        # define the local network which is used to calculate the gradient
        with tf.device(worker_device):
            with tf.variable_scope('local'):
                self.local_network = net = UnrealModel(self.action_n, self.env,
                                                       self.experience)
                net.global_step = self.global_step

            # add summaries for losses and norms
            self.batch_size = tf.to_float(tf.shape(net.base_input)[0])
            base_loss = self.local_network.base_loss
            pc_loss = self.local_network.pc_loss
            rp_loss = self.local_network.rp_loss
            vr_loss = self.local_network.vr_loss
            entropy = tf.reduce_sum(self.local_network.entropy)
            self.loss = base_loss + pc_loss + rp_loss + vr_loss
            grads = tf.gradients(self.loss, net.var_list)
            tf.summary.scalar('model/a3c_loss', base_loss / self.batch_size)
            tf.summary.scalar('model/pc_loss', pc_loss / self.batch_size)
            tf.summary.scalar('model/rp_loss', rp_loss / self.batch_size)
            tf.summary.scalar('model/vr_loss', vr_loss / self.batch_size)
            tf.summary.scalar('model/grad_global_norm', tf.global_norm(grads))
            tf.summary.scalar('model/var_global_norm',
                              tf.global_norm(net.var_list))
            tf.summary.scalar('model/entropy', entropy / self.batch_size)
            tf.summary.image('model/state', net.base_input)
            self.summary_op = tf.summary.merge_all()

            # clip the gradients to avoid gradient explosion
            grads, _ = tf.clip_by_global_norm(grads, GRAD_NORM_CLIP)

            self.sync = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(net.var_list, self.network.var_list)
            ])
            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.to_int32(
                self.batch_size))
            lr = log_uniform(LR_LOW, LR_HIGH)
            opt = tf.train.RMSPropOptimizer(learning_rate=lr,
                                            decay=RMSP_ALPHA,
                                            momentum=0.0,
                                            epsilon=RMSP_EPSILON)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_step = 0
Example #25
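    # Runner: creates the shared global Agent and a SharedAdam optimizer, builds one Trainer per parallel worker, restores from the Saver and prepares the training threads.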
    def __init__(self, args):
        super(Runner, self).__init__()
        self.env_name = args.env_name
        self.log_path = os.path.join(args.log_path, args.env_name)
        self.max_time_step = args.max_time_step
        self.initial_alpha_low = args.initial_alpha_low
        self.initial_alpha_high = args.initial_alpha_high
        self.initial_alpha_log_rate = args.initial_alpha_log_rate
        self.use_pixel_change = args.use_pixel_change
        self.use_value_replay = args.use_value_replay
        self.use_reward_prediction = args.use_reward_prediction
        self.pixel_change_lambda = args.pixel_change_lambda
        self.entropy_beta = args.entropy_beta
        self.alpha = args.alpha
        self.epsilon = args.epsilon
        self.parallel_size = args.parallel_size
        self.local_t_max = args.local_t_max
        self.gamma = args.gamma
        self.gamma_pc = args.gamma_pc
        self.experience_history_size = args.experience_history_size
        self.save_interval_step = args.save_interval_step
        self.grad_norm_clip = args.grad_norm_clip
        initial_learning_rate = log_uniform(self.initial_alpha_low,
                                            self.initial_alpha_high,
                                            self.initial_alpha_log_rate)

        self.global_t = 0
        self.stop_requested = False
        self.terminate_reqested = False
        action_size = Environment.get_action_size(self.env_name)
        self.global_network = Agent(-1,
                                    action_size,
                                    self.use_pixel_change,
                                    self.use_value_replay,
                                    self.use_reward_prediction,
                                    self.pixel_change_lambda,
                                    self.entropy_beta,
                                    args.device)
        self.trainers = []
        self.global_network.share_memory()
        optimizor = SharedAdam(params=self.global_network.parameters(), lr=initial_learning_rate,
                               weight_decay=self.alpha,
                               eps=self.epsilon)

        for i in range(self.parallel_size):
            trainer = Trainer(i,
                              self.global_network,
                              initial_learning_rate,
                              self.env_name,
                              self.use_pixel_change,
                              self.use_value_replay,
                              self.use_reward_prediction,
                              self.pixel_change_lambda,
                              self.entropy_beta,
                              self.local_t_max,
                              self.gamma,
                              self.gamma_pc,
                              self.experience_history_size,
                              self.max_time_step,
                              self.grad_norm_clip,
                              optimizor,
                              args.device)
            self.trainers.append(trainer)

        self.summary_writer = SummaryWriter(self.log_path)
        self.saver = Saver(self.log_path)

        self.global_t, wall_t = self.saver.restore(self.global_network)
        # set global step
        print(">>> global step set: ", self.global_t)
        self.next_save_steps = self.save_interval_step if self.global_t == 0 \
            else (self.global_t + self.save_interval_step) // self.save_interval_step * self.save_interval_step
        self.train_threads = []
        for i in range(self.parallel_size):
            self.train_threads.append(threading.Thread(target=self.train_function, args=(i, True)))
        self.start_time = time.time() - wall_t
Example #26
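    # Trainer with an optional segmentation head: the local UnrealModel receives segnet parameters, and any initialization failure is re-raised with the thread index attached.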
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_lstm, use_pixel_change, use_value_replay,
                 use_reward_prediction, pixel_change_lambda, entropy_beta,
                 local_t_max, n_step_TD, gamma, gamma_pc,
                 experience_history_size, max_global_time_step, device,
                 segnet_param_dict, image_shape, is_training, n_classes,
                 random_state, termination_time, segnet_lambda, dropout):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_lstm = use_lstm
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.n_step_TD = n_step_TD
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.objective_size = Environment.get_objective_size(
            env_type, env_name)

        self.segnet_param_dict = segnet_param_dict
        self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None)

        self.is_training = is_training
        self.n_classes = n_classes
        self.segnet_lambda = segnet_lambda

        self.run_metadata = tf.RunMetadata()
        self.many_runs_timeline = TimeLiner()

        self.random_state = random_state
        self.termination_time = termination_time
        self.dropout = dropout

        try:
            self.local_network = UnrealModel(
                self.action_size,
                self.objective_size,
                thread_index,
                use_lstm,
                use_pixel_change,
                use_value_replay,
                use_reward_prediction,
                pixel_change_lambda,
                entropy_beta,
                device,
                segnet_param_dict=self.segnet_param_dict,
                image_shape=image_shape,
                is_training=is_training,
                n_classes=n_classes,
                segnet_lambda=self.segnet_lambda,
                dropout=dropout)

            self.local_network.prepare_loss()

            self.apply_gradients = grad_applier.minimize_local(
                self.local_network.total_loss, global_network.get_vars(),
                self.local_network.get_vars(), self.thread_index)

            self.sync = self.local_network.sync_from(global_network)
            self.experience = Experience(self.experience_history_size,
                                         random_state=self.random_state)
            self.local_t = 0
            self.initial_learning_rate = initial_learning_rate
            self.episode_reward = 0
            # For log output
            self.prev_local_t = -1
            self.prev_local_t_loss = 0
            self.sr_size = 50
            self.success_rates = deque(maxlen=self.sr_size)
        except Exception as e:
            print(str(e))  #, flush=True)
            raise Exception(
                "Problem in Trainer {} initialization".format(thread_index))
Example #27
        if step % RECONSTRUCTION_CHECK_INTERVAL == 0:
            # Create reconstruction image
            vae_projection.check_reconstruction(sess, environment, 10,
                                                RECONSTRUCTION_IMAGE_DIR)


def train_episodic_control(agent):
    # TODO:
    for i in range(1):
        ret = agent.step()
        if ret is not None:
            print(ret)


num_actions = Environment.get_action_size()
environment = Environment.create_environment()

vae_projection = VAEProjection()

qec_table = QECTable(vae_projection, state_dim, num_actions, k, knn_capacity)

agent = EpisodicControlAgent(environment, qec_table, num_actions, gamma,
                             epsilon)

# Session should be started after Lab environment is created. (To run Lab with GPU)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

vae_projection.set_session(sess)
Example #28
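    # Training driver: builds the global UnrealModel and per-thread Trainers, sets up TensorBoard summaries and checkpoint restoring, then launches the training threads.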
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"

        initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                            flags.initial_alpha_high,
                                            flags.initial_alpha_log_rate)

        self.global_t = 0

        self.stop_requested = False
        self.terminate_reqested = False

        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)

        self.global_network = UnrealModel(action_size, -1,
                                          flags.use_pixel_change,
                                          flags.use_value_replay,
                                          flags.use_reward_prediction,
                                          flags.pixel_change_lambda,
                                          flags.entropy_beta, device)
        self.trainers = []

        learning_rate_input = tf.placeholder("float")

        grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                      decay=flags.rmsp_alpha,
                                      momentum=0.0,
                                      epsilon=flags.rmsp_epsilon,
                                      clip_norm=flags.grad_norm_clip,
                                      device=device)

        for i in range(flags.parallel_size):
            trainer = Trainer(i, self.global_network, initial_learning_rate,
                              learning_rate_input, grad_applier,
                              flags.env_type, flags.env_name,
                              flags.use_pixel_change, flags.use_value_replay,
                              flags.use_reward_prediction,
                              flags.pixel_change_lambda, flags.entropy_beta,
                              flags.local_t_max, flags.gamma, flags.gamma_pc,
                              flags.experience_history_size,
                              flags.max_time_step, device)
            self.trainers.append(trainer)

        flags.checkpoint_dir = sys.path[0] + flags.checkpoint_dir
        flags.log_file = sys.path[0] + flags.log_file
        # prepare session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        COORD = tf.train.Coordinator()

        print('before init')
        self.sess.run(tf.global_variables_initializer())
        print('after init')
        # summary for tensorboard
        self.score_input = tf.placeholder(tf.int32)
        self.vr_loss_input = tf.placeholder(tf.float32)
        self.rp_loss_input = tf.placeholder(tf.float32)

        self.summary_op_score = tf.summary.scalar("score", self.score_input)
        self.merge_vr = tf.summary.scalar("value_replay_loss",
                                          self.vr_loss_input)
        merge_rp = tf.summary.scalar("reward_prediction_loss",
                                     self.rp_loss_input)

        self.summary_op_loss = tf.summary.merge([self.merge_vr, merge_rp])

        # self.summary_op = tf.summary.merge_all()
        # self.summary_op_score = tf.summary.merge([self.score_input])
        # self.summary_op_loss = tf.summary.merge([self.vr_loss_input])
        self.summary_writer = tf.summary.FileWriter(flags.log_file,
                                                    self.sess.graph)

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step

        else:
            print("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        # run training threads
        # set start time
        self.start_time = time.time() - self.wall_t
        self.train_threads = []
        for i in range(flags.parallel_size):
            t = threading.Thread(target=self.train_function, args=(i, True))
            t.start()
            self.train_threads.append(t)

        signal.signal(signal.SIGINT, self.signal_handler)
        COORD.join(self.train_threads)

        # for t in self.train_threads:
        #   t.start()

        print('Press Ctrl+C to stop')
        signal.pause()
Example #29
  return math.exp(v)

device = "/cpu:0"
if USE_GPU:
  device = "/gpu:0"

initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
                                    INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)

global_t = 0

stop_requested = False
terminate_reqested = False

action_size = Environment.get_action_size()
global_network = UnrealModel(action_size, -1, device)

trainers = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate = learning_rate_input,
                              decay = RMSP_ALPHA,
                              momentum = 0.0,
                              epsilon = RMSP_EPSILON,
                              clip_norm = GRAD_NORM_CLIP,
                              device = device)

for i in range(PARALLEL_SIZE):
  trainer = Trainer(i,
Example #30
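  # ACKTR-style trainer: in addition to the usual local UnrealModel and gradient application, it builds a KfacOptimizer over the network's Fisher loss.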
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               env_type,
               env_name,
               use_pixel_change,
               use_value_replay,
               use_reward_prediction,
               pixel_change_lambda,
               entropy_beta,
               vf_coeff,
               local_t_max,
               gamma,
               gamma_pc,
               experience_history_size,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    
    self.local_network = UnrealModel(self.action_size,
                                     thread_index,
                                     use_pixel_change,
                                     use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda,
                                     entropy_beta,
                                     vf_coeff,
                                     device)
    #self.local_network.prepare_loss()

    #adding things for acktr
    self.local_network.prepare_loss_acktr()

    self.optim = optim = KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

    update_stats_op = optim.compute_and_apply_stats(self.local_network.fisher_loss, var_list=self.local_network.params)
    train_op, q_runner = optim.apply_gradients(list(zip(self.local_network.grads,self.local_network.params)))

    

    #update the rest according to normal stuff
    self.apply_gradients = grad_applier.minimize_local(self.local_network.intermediate_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars())
    
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0