def __init__(self, display_size):
  pygame.init()
  self.surface = pygame.display.set_mode(display_size, 0, 24)
  pygame.display.set_caption('UNREAL')

  self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
  self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                    flags.use_lstm,
                                    flags.use_pixel_change,
                                    flags.use_value_replay,
                                    flags.use_reward_prediction,
                                    0.0, 0.0, "/cpu:0",
                                    for_display=True)
  self.environment = Environment.create_environment(
      flags.env_type, flags.env_name,
      env_args={'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene})
  self.font = pygame.font.SysFont(None, 20)
  self.value_history = ValueHistory()
  self.state_history = StateHistory()
  self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_lstm, use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, device):
  self.thread_index = thread_index
  self.learning_rate_input = learning_rate_input
  self.env_type = env_type
  self.env_name = env_name
  self.use_lstm = use_lstm
  self.use_pixel_change = use_pixel_change
  self.use_value_replay = use_value_replay
  self.use_reward_prediction = use_reward_prediction
  self.local_t_max = local_t_max
  self.gamma = gamma
  self.gamma_pc = gamma_pc
  self.experience_history_size = experience_history_size
  self.max_global_time_step = max_global_time_step
  self.action_size = Environment.get_action_size(env_type, env_name)
  self.objective_size = Environment.get_objective_size(env_type, env_name)
  self.local_network = UnrealModel(self.action_size, self.objective_size,
                                   thread_index, use_lstm,
                                   use_pixel_change,
                                   use_value_replay,
                                   use_reward_prediction,
                                   pixel_change_lambda,
                                   entropy_beta,
                                   device)
  self.local_network.prepare_loss()

  self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                     global_network.get_vars(),
                                                     self.local_network.get_vars())
  self.sync = self.local_network.sync_from(global_network)
  self.experience = Experience(self.experience_history_size)
  self.local_t = 0
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0
  # For log output
  self.prev_local_t = 0
def __init__(self):
  self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
  self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                    flags.use_lstm,
                                    flags.use_pixel_change,
                                    flags.use_value_replay,
                                    flags.use_reward_prediction,
                                    0.0, 0.0, "/cpu:0",
                                    for_display=True)
  self.environment = Environment.create_environment(
      flags.env_type, flags.env_name,
      env_args={'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'seed': flags.seed,
                # 'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene})
  self.episode_reward = 0
  self.cnt_success = 0
def __init__(self):
  self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)

  print('flags:use_pixel_change {}'.format(flags.use_pixel_change))
  sleep(10)

  self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                    flags.use_lstm,
                                    flags.use_pixel_change,
                                    flags.use_value_replay,
                                    flags.use_reward_prediction,
                                    0.0, 0.0, "/cpu:0",
                                    for_display=True)
  self.environment = Environment.create_environment(
      flags.env_type, flags.env_name,
      env_args={'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene})

  print('\n======\nENV in Evaluate::ctor')
  print(self.environment)
  print(self.global_network)
  print('val_replay!!! {}'.format(flags.use_value_replay))
  print(flags.split)
  print('=======\n')
  sleep(10)

  self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             env_args, use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, spatial_dim, optimizor):
  self.thread_index = thread_index
  self.env_args = env_args
  self.use_pixel_change = use_pixel_change
  self.use_value_replay = use_value_replay
  self.use_reward_prediction = use_reward_prediction
  self.local_t_max = local_t_max
  self.gamma = gamma
  self.gamma_pc = gamma_pc
  self.experience_history_size = experience_history_size
  self.max_global_time_step = max_global_time_step
  self.action_size = Environment.get_action_size()
  self.local_network = Agent(thread_index,
                             use_pixel_change,
                             use_value_replay,
                             use_reward_prediction,
                             pixel_change_lambda,
                             entropy_beta)
  self.global_network = global_network
  self.experience = Experience(self.experience_history_size)
  self.local_t = 0
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0
  self.spatial_dim = spatial_dim
  self.obs_processer = ObsProcesser()
  self.action_processer = ActionProcesser(dim=spatial_dim)
  self.optimizor = optimizor
  self.distribution = th.distributions.Categorical
  # For log output
  self.prev_local_t = 0
  self.environment = Environment.create_environment(self.env_args)
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
  self.thread_index = thread_index
  self.learning_rate_input = learning_rate_input
  self.max_global_time_step = max_global_time_step

  self.action_size = Environment.get_action_size()
  self.local_network = UnrealModel(self.action_size, thread_index, device)
  self.local_network.prepare_loss()

  self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                     global_network.get_vars(),
                                                     self.local_network.get_vars())
  self.sync = self.local_network.sync_from(global_network)
  self.environment = Environment.create_environment()
  self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
  self.local_t = 0
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0
  # For log output
  self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             env_name, use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, grad_norm_clip,
             optimizor, device):
  self.thread_index = thread_index
  self.env_name = env_name
  self.use_pixel_change = use_pixel_change
  self.use_value_replay = use_value_replay
  self.use_reward_prediction = use_reward_prediction
  self.local_t_max = local_t_max
  self.gamma = gamma
  self.gamma_pc = gamma_pc
  self.experience_history_size = experience_history_size
  self.max_global_time_step = max_global_time_step
  self.action_size = Environment.get_action_size(env_name)
  self.local_network = Agent(thread_index,
                             self.action_size,
                             use_pixel_change,
                             use_value_replay,
                             use_reward_prediction,
                             pixel_change_lambda,
                             entropy_beta,
                             device)
  self.global_network = global_network
  self.experience = Experience(self.experience_history_size)
  self.local_t = 0
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0
  self.optimizor = optimizor
  self.distribution = torch.distributions.Categorical
  self.grad_norm_clip = grad_norm_clip
  self.prev_local_t = 0
def __init__(self, display_size):
  pygame.init()
  self.surface = pygame.display.set_mode(display_size, 0, 24)
  name = 'UNREAL' if flags.segnet == 0 else "A3C ErfNet"
  pygame.display.set_caption(name)

  env_config = sim_config.get(flags.env_name)
  self.image_shape = [env_config.get('height', 88), env_config.get('width', 88)]
  segnet_param_dict = {'segnet_mode': flags.segnet}
  is_training = tf.placeholder(tf.bool, name="training")

  map_file = env_config.get('objecttypes_file', '../../objectTypes.csv')
  self.label_mapping = pd.read_csv(map_file, sep=',', header=0)
  self.get_col_index()

  self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
  self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                    flags.use_lstm,
                                    flags.use_pixel_change,
                                    flags.use_value_replay,
                                    flags.use_reward_prediction,
                                    0.0, 0.0, "/gpu:0",
                                    segnet_param_dict=segnet_param_dict,
                                    image_shape=self.image_shape,
                                    is_training=is_training,
                                    n_classes=flags.n_classes,
                                    segnet_lambda=flags.segnet_lambda,
                                    dropout=flags.dropout,
                                    for_display=True)
  self.environment = Environment.create_environment(
      flags.env_type, flags.env_name, flags.termination_time_sec,
      env_args={'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene})
  self.font = pygame.font.SysFont(None, 20)
  self.value_history = ValueHistory()
  self.state_history = StateHistory()
  self.episode_reward = 0
def __init__(self):
  self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
  self.action_size = Environment.get_action_size()
  self.global_network = UnrealModel(self.action_size, -1, "/cpu:0", for_display=True)
  self.env = Environment.create_environment()
  self.value_history = ValueHistory()
  self.state_history = StateHistory()
  self.ep_reward = 0
  self.mazemap = MazeMap()
def __init__(self, runner, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             entropy_beta, gamma, experience, max_global_time_step,
             device, value_lambda):
  self.runner = runner
  self.learning_rate_input = learning_rate_input
  self.env_type = env_type
  self.env_name = env_name
  self.gamma = gamma
  self.max_global_time_step = max_global_time_step
  self.action_size = Environment.get_action_size(env_type, env_name)
  self.obs_size = Environment.get_obs_size(env_type, env_name)
  self.global_network = global_network
  self.local_network = UnrealModel(self.action_size,
                                   self.obs_size,
                                   1,
                                   entropy_beta,
                                   device,
                                   value_lambda=value_lambda)
  self.local_network.prepare_loss()

  self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                     self.global_network.get_vars(),
                                                     self.local_network.get_vars())
  self.sync = self.local_network.sync_from(self.global_network, name="base_trainer")
  self.experience = experience
  self.local_t = 0
  self.next_log_t = 0
  self.next_performance_t = PERFORMANCE_LOG_INTERVAL
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0
  # trackers for the experience replay creation
  self.last_state = None
  self.last_action = 0
  self.last_reward = 0
  self.ep_ploss = 0.
  self.ep_vloss = 0.
  self.ep_entr = []
  self.ep_grad = []
  self.ep_l = 0
def __init__(self, args, display_size, saver):
  pygame.init()
  self.args = args
  self.surface = pygame.display.set_mode(display_size, 0, 24)
  pygame.display.set_caption('UNREAL')

  args.action_size = Environment.get_action_size(args.env_name)
  self.global_network = Agent(1, args)
  saver.restore(self.global_network)
  self.global_network.eval()
  self.environment = Environment.create_environment(args.env_name)
  self.font = pygame.font.SysFont(None, 20)
  self.value_history = ValueHistory()
  self.state_history = StateHistory()
  self.distribution = torch.distributions.Categorical
  self.episode_reward = 0
def test_step(self):
  environment = Environment.create_environment()
  action_size = Environment.get_action_size()
  if sys.platform == 'darwin':
    self.assertTrue(action_size == 6)
  else:
    self.assertTrue(action_size == 8)

  for i in range(3):
    self.assertTrue(environment.last_observation.shape == (84, 84))
    if SAVE_IMAGE:
      scipy.misc.imsave("debug_observation{0}.png".format(i),
                        environment.last_observation)
    reward, terminal = environment.step(0)
def main(args):
  action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
  global_network = UnrealModel(action_size, objective_size, -1,
                               flags.use_lstm,
                               flags.use_pixel_change,
                               flags.use_value_replay,
                               flags.use_reward_prediction,
                               0.0, 0.0,
                               "/cpu:0")  # use CPU for weight visualize tool
  sess = tf.Session()

  init = tf.global_variables_initializer()
  sess.run(init)

  saver = tf.train.Saver()
  checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
  if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
  else:
    print("Could not find old checkpoint")

  vars = {}
  var_list = global_network.get_vars()
  for v in var_list:
    vars[v.name] = v

  W_conv1 = sess.run(vars['net_-1/base_conv/W_base_conv1:0'])

  # show graph of W_conv1
  fig, axes = plt.subplots(3, 16, figsize=(12, 6),
                           subplot_kw={'xticks': [], 'yticks': []})
  fig.subplots_adjust(hspace=0.1, wspace=0.1)

  for ax, i in zip(axes.flat, range(3 * 16)):
    inch = i // 16
    outch = i % 16
    img = W_conv1[:, :, inch, outch]
    ax.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
    ax.set_title(str(inch) + "," + str(outch))

  plt.show()
def __init__(self):
  self.action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  self.objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
  env_config = sim_config.get(flags.env_name)
  self.image_shape = [env_config['height'], env_config['width']]
  segnet_param_dict = {'segnet_mode': flags.segnet}
  is_training = tf.placeholder(tf.bool, name="training")

  # for display, the for_display param in UnrealModel sets its value
  self.global_network = UnrealModel(self.action_size, self.objective_size, -1,
                                    flags.use_lstm,
                                    flags.use_pixel_change,
                                    flags.use_value_replay,
                                    flags.use_reward_prediction,
                                    0.0,  # flags.pixel_change_lambda
                                    0.0,  # flags.entropy_beta
                                    device,
                                    segnet_param_dict=segnet_param_dict,
                                    image_shape=self.image_shape,
                                    is_training=is_training,
                                    n_classes=flags.n_classes,
                                    segnet_lambda=flags.segnet_lambda,
                                    dropout=flags.dropout,
                                    for_display=True)
  self.environment = Environment.create_environment(
      flags.env_type, flags.env_name, flags.termination_time_sec,
      env_args={'episode_schedule': flags.split,
                'log_action_trace': flags.log_action_trace,
                'max_states_per_scene': flags.episodes_per_scene,
                'episodes_per_scene_test': flags.episodes_per_scene})
  self.global_network.prepare_loss()

  self.total_loss = []
  self.segm_loss = []
  self.episode_reward = [0]
  self.episode_roomtype = []
  self.roomType_dict = {}
  self.segnet_class_dict = {}
  self.success_rate = []

  self.batch_size = 20
  self.batch_cur_num = 0
  self.batch_prev_num = 0
  self.batch_si = []
  self.batch_sobjT = []
  self.batch_a = []
  self.batch_reward = []
def __init__(self, display_size):
  pygame.init()
  self.surface = pygame.display.set_mode(display_size, 0, 24)
  pygame.display.set_caption('UNREAL')

  self.action_size = Environment.get_action_size()
  self.global_network = UnrealModel(self.action_size, -1, "/cpu:0", for_display=True)
  self.environment = Environment.create_environment()
  self.font = pygame.font.SysFont(None, 20)
  self.value_history = ValueHistory()
  self.state_history = StateHistory()
  self.episode_reward = 0
def check_environment(self, env_type, env_name):
  env = Environment.create_environment(env_type, env_name, 0)
  action_size = Environment.get_action_size(env_type, env_name)

  for i in range(3):
    state, reward, terminal = env.process(0)
    print(state)
    print(reward)
    print(terminal)
    # # Check shape
    # self.assertTrue(state.shape == (84, 84, 3))
    # # state and pixel_change value range should be [0,1]
    # self.assertTrue(np.amax(state) <= 1.0)

  env.stop()
def learn_flappyb():
  env = Environment(draw=DRAW, fps=1, debug=False,
                    dist_to_pipe=DIFFICULTY_LEARN,
                    dist_between_pipes=DIST_BETWEEN_PIPES,
                    obs_this_pipe=OBS_THIS_PIPE_LEARN)
  writer = None
  if WRITE:
    writer = SummaryWriter(comment=NAME)

  observation_space = env.get_observation_size_buffer()
  action_space = env.get_action_size()
  model = load_model('models/dqn/{}.h5'.format(LOAD_NAME))
  dqn_solver = DQNSolver(observation_space, action_space, model)
  run = 0

  if SAVE_MODEL:
    name = '{}-PART={}'.format(NAME, run)
    dqn_solver.model.save('models/dqn/{}.h5'.format(name))

  while True:
    run += 1
    state = env.reset()
    state = np.reshape(state, [1, observation_space])
    step = 0
    reward_score = 0
    while True:
      step += 1
      action = dqn_solver.act(state, env)
      state_next, reward, terminal, info = env.step_buffer(action)
      reward_score += reward
      state_next = np.reshape(state_next, [1, observation_space])
      dqn_solver.remember(state, action, reward, state_next, terminal)
      state = state_next
      if terminal:
        print("Run: " + str(run) +
              ", exploration: " + str(dqn_solver.exploration_rate) +
              ", score: " + str(reward_score))
        if WRITE:
          writer.add_scalar("reward", reward_score, run)
        break
      dqn_solver.experience_replay()
    if (run % 100 == 0) and SAVE_MODEL:
      name = '{}-PART={}'.format(NAME, run)
      dqn_solver.model.save('models/dqn/{}.h5'.format(name))

  if WRITE:
    writer.close()
def __init__(self, display_size, model):
  pygame.init()
  self.surface = pygame.display.set_mode(display_size, 0, 24)
  pygame.display.set_caption('MAPREADER')

  self.action_size = Environment.get_action_size()
  self.global_network = model
  self.environment = Environment.create_environment(*DISPLAY_LEVEL)
  self.font = pygame.font.SysFont(None, 20)
  self.value_history = ValueHistory()
  self.step_count = 0
  self.episode_reward = 0
  self.episode_intrinsic_reward = 0
  self.state = self.environment.last_state
  self.replan = True
  self.path = []
  self.maze_size = DISPLAY_LEVEL[0] // 40 * 2 + 7
def test_process(self):
  environment = Environment.create_environment()
  action_size = Environment.get_action_size()

  for i in range(3):
    state, reward, terminal, pixel_change = environment.process(0)

    # Check shape
    self.assertTrue(state.shape == (84, 84, 3))
    self.assertTrue(environment.last_state.shape == (84, 84, 3))
    self.assertTrue(pixel_change.shape == (20, 20))

    # state and pixel_change value range should be [0,1]
    self.assertTrue(np.amax(state) <= 1.0)
    self.assertTrue(np.amin(state) >= 0.0)
    self.assertTrue(np.amax(pixel_change) <= 1.0)
    self.assertTrue(np.amin(pixel_change) >= 0.0)
def get_prediction(history, action, env_name, check_dir):
  action_size = Environment.get_action_size(env_type, env_name)
  global_network = UnrealModel(action_size, -1,
                               # flags.use_pixel_change,
                               # flags.use_value_replay,
                               # flags.use_reward_prediction,
                               # flags.use_future_reward_prediction,
                               # flags.use_autoencoder,
                               False, False, False, True, True,
                               .0, .0,
                               "/cpu:0")
  config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)

  init = tf.global_variables_initializer()
  sess.run(init)

  saver = tf.train.Saver()
  checkpoint = tf.train.get_checkpoint_state(check_dir)
  if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
  else:
    print("Could not find old checkpoint")

  feed_dict = {
      global_network.frp_input: np.zeros((4, 84, 84, 3)),
      global_network.frp_action_input: np.zeros((1, action_size))
  }
  encoder_output = sess.run(global_network.encoder_output, feed_dict)
  print(encoder_output)
def play_flappyb():
  env = Environment(draw=True, fps=1, debug=True,
                    dist_to_pipe=DIFFICULTY_PLAY,
                    dist_between_pipes=DIST_BETWEEN_PIPES,
                    obs_this_pipe=OBS_THIS_PIPE_PLAY)
  observation_space = env.get_observation_size_buffer()
  action_space = env.get_action_size()
  model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME))
  dqn_solver = DQNSolver(observation_space, action_space, model)

  for i in range(20):
    state = env.reset()
    state = np.reshape(state, [1, observation_space])
    is_done = False
    while not is_done:
      action = dqn_solver.act_free(state)
      # action = env.get_action_random()
      state_next, reward, terminal, info = env.step_buffer(action)
      is_done = terminal
      state = np.reshape(state_next, [1, observation_space])
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
  self.thread_index = thread_index
  self.learning_rate_input = learning_rate_input
  self.max_global_time_step = max_global_time_step

  self.action_size = Environment.get_action_size()
  self.local_network = MapReaderModel(self.action_size, thread_index, device)
  self.local_network.prepare_loss()

  self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                     global_network.get_vars(),
                                                     self.local_network.get_vars())
  self.sync = self.local_network.sync_from(global_network)
  self.experience = Experience(EXPERIENCE_HISTORY_SIZE)
  self.local_t = 0
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0

  self.maze_size = 5
  if self.thread_index in range(2):
    self.maze_size = 13
  elif self.thread_index in [2, 3]:
    self.maze_size = 11
  elif self.thread_index in [4, 5]:
    self.maze_size = 9
  elif self.thread_index in [6, 7]:
    self.maze_size = 7
  self.level_seed = np.random.randint(LEVEL_SET_SIZE)

  # For log output
  self.prev_local_t = 0
  self.last_terminal_local_t = 0
  self.steps_buffer = deque()
  self.correct_exits = 0
  self.running = True
def run(self):
  device = "/cpu:0"
  if USE_GPU:
    device = "/gpu:0"

  logger.debug("start App")

  initial_learning_rate = flags.initial_learning_rate

  self.global_t = 0
  self.aux_t = 0

  self.stop_requested = False
  self.terminate_requested = False

  logger.debug("getting action size and observation size...")
  action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  obs_size = Environment.get_obs_size(flags.env_type, flags.env_name)

  # Setup Global Network
  logger.debug("loading global model...")
  self.global_network = UnrealModel(
      action_size, obs_size, -1, flags.entropy_beta, device,
      use_pixel_change=flags.use_pixel_change,
      use_value_replay=flags.use_value_replay,
      use_reward_prediction=flags.use_reward_prediction,
      use_temporal_coherence=flags.use_temporal_coherence,
      use_proportionality=flags.use_proportionality,
      use_causality=flags.use_causality,
      use_repeatability=flags.use_repeatability,
      value_lambda=flags.value_lambda,
      pixel_change_lambda=flags.pixel_change_lambda,
      temporal_coherence_lambda=flags.temporal_coherence_lambda,
      proportionality_lambda=flags.proportionality_lambda,
      causality_lambda=flags.causality_lambda,
      repeatability_lambda=flags.repeatability_lambda)
  logger.debug("done loading global model")

  learning_rate_input = tf.placeholder("float")

  # Setup gradient calculator
  #"""
  grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                #decay = flags.rmsp_alpha,
                                momentum=0.0,
                                #epsilon = flags.rmsp_epsilon,
                                clip_norm=flags.grad_norm_clip,
                                device=device)
  """
  grad_applier = AdamApplier(learning_rate = learning_rate_input,
                             clip_norm=flags.grad_norm_clip,
                             device=device)
  """

  # Start environment
  self.environment = Environment.create_environment(flags.env_type, flags.env_name)
  logger.debug("done loading environment")

  # Setup runner
  self.runner = RunnerThread(flags, self.environment, self.global_network,
                             action_size, obs_size, device, visualise)
  logger.debug("done setting up RunnerTread")

  # Setup experience
  self.experience = Experience(flags.experience_history_size)

  #@TODO check device usage: should we build a cluster?
  # Setup Base Network
  self.base_trainer = BaseTrainer(self.runner,
                                  self.global_network,
                                  initial_learning_rate,
                                  learning_rate_input,
                                  grad_applier,
                                  flags.env_type, flags.env_name,
                                  flags.entropy_beta, flags.gamma,
                                  self.experience,
                                  flags.max_time_step,
                                  device,
                                  flags.value_lambda)

  # Setup Aux Networks
  self.aux_trainers = []
  for k in range(flags.parallel_size):
    self.aux_trainers.append(
        AuxTrainer(self.global_network,
                   k + 2,  # -1 is global, 0 is runnerthread, 1 is base
                   flags.use_base,
                   flags.use_pixel_change,
                   flags.use_value_replay,
                   flags.use_reward_prediction,
                   flags.use_temporal_coherence,
                   flags.use_proportionality,
                   flags.use_causality,
                   flags.use_repeatability,
                   flags.value_lambda,
                   flags.pixel_change_lambda,
                   flags.temporal_coherence_lambda,
                   flags.proportionality_lambda,
                   flags.causality_lambda,
                   flags.repeatability_lambda,
                   flags.aux_initial_learning_rate,
                   learning_rate_input,
                   grad_applier,
                   self.aux_t,
                   flags.env_type, flags.env_name,
                   flags.entropy_beta,
                   flags.local_t_max,
                   flags.gamma,
                   flags.aux_lambda,
                   flags.gamma_pc,
                   self.experience,
                   flags.max_time_step,
                   device))

  # Start tensorflow session
  config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  self.sess = tf.Session(config=config)
  self.sess.run(tf.global_variables_initializer())

  self.init_tensorboard()

  # init or load checkpoint with saver
  self.saver = tf.train.Saver(self.global_network.get_vars())
  checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
  if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
    self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
    checkpointpath = checkpoint.model_checkpoint_path.replace("/", "\\")
    logger.info("checkpoint loaded: {}".format(checkpointpath))
    tokens = checkpoint.model_checkpoint_path.split("-")
    # set global step
    self.global_t = int(tokens[1])
    logger.info(">>> global step set: {}".format(self.global_t))
    logger.info(">>> aux step: {}".format(self.aux_t))
    # set wall time
    wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
    with open(wall_t_fname, 'r') as f:
      self.wall_t = float(f.read())
      self.next_save_steps = (self.global_t + flags.save_interval_step) \
          // flags.save_interval_step * flags.save_interval_step
      logger.debug("next save steps:{}".format(self.next_save_steps))
  else:
    logger.info("Could not find old checkpoint")
    # set wall time
    self.wall_t = 0.0
    self.next_save_steps = flags.save_interval_step

  signal.signal(signal.SIGINT, self.signal_handler)

  # set start time
  self.start_time = time.time() - self.wall_t

  # Start runner
  self.runner.start_runner(self.sess)

  # Start base_network thread
  self.base_train_thread = threading.Thread(target=self.base_train_function, args=())
  self.base_train_thread.start()

  # Start aux_network threads
  self.aux_train_threads = []
  for k in range(flags.parallel_size):
    self.aux_train_threads.append(
        threading.Thread(target=self.aux_train_function, args=(k,)))
    self.aux_train_threads[k].start()

  logger.debug(threading.enumerate())

  logger.info('Press Ctrl+C to stop')
  signal.pause()
def __init__(self, env, task, visualise):
  self.env = env
  self.task = task
  self.ob_shape = [HEIGHT, WIDTH, CHANNEL]
  self.action_n = Environment.get_action_size()

  # define the network stored in ps which is used to sync
  worker_device = '/job:worker/task:{}'.format(task)
  with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
    with tf.variable_scope('global'):
      self.experience = Experience(EXPERIENCE_HISTORY_SIZE)  # exp replay pool
      self.network = UnrealModel(self.action_n, self.env, self.experience)
      self.global_step = tf.get_variable('global_step',
                                         dtype=tf.int32,
                                         initializer=tf.constant(0, dtype=tf.int32),
                                         trainable=False)

  # define the local network which is used to calculate the gradient
  with tf.device(worker_device):
    with tf.variable_scope('local'):
      self.local_network = net = UnrealModel(self.action_n, self.env, self.experience)
      net.global_step = self.global_step

    # add summaries for losses and norms
    self.batch_size = tf.to_float(tf.shape(net.base_input)[0])
    base_loss = self.local_network.base_loss
    pc_loss = self.local_network.pc_loss
    rp_loss = self.local_network.rp_loss
    vr_loss = self.local_network.vr_loss
    entropy = tf.reduce_sum(self.local_network.entropy)
    self.loss = base_loss + pc_loss + rp_loss + vr_loss
    grads = tf.gradients(self.loss, net.var_list)

    tf.summary.scalar('model/a3c_loss', base_loss / self.batch_size)
    tf.summary.scalar('model/pc_loss', pc_loss / self.batch_size)
    tf.summary.scalar('model/rp_loss', rp_loss / self.batch_size)
    tf.summary.scalar('model/vr_loss', vr_loss / self.batch_size)
    tf.summary.scalar('model/grad_global_norm', tf.global_norm(grads))
    tf.summary.scalar('model/var_global_norm', tf.global_norm(net.var_list))
    tf.summary.scalar('model/entropy', entropy / self.batch_size)
    tf.summary.image('model/state', net.base_input)
    self.summary_op = tf.summary.merge_all()

    # clip the gradients to avoid gradient explosion
    grads, _ = tf.clip_by_global_norm(grads, GRAD_NORM_CLIP)
    self.sync = tf.group(*[v1.assign(v2)
                           for v1, v2 in zip(net.var_list, self.network.var_list)])

    grads_and_vars = list(zip(grads, self.network.var_list))
    inc_step = self.global_step.assign_add(tf.to_int32(self.batch_size))

    lr = log_uniform(LR_LOW, LR_HIGH)
    opt = tf.train.RMSPropOptimizer(learning_rate=lr,
                                    decay=RMSP_ALPHA,
                                    momentum=0.0,
                                    epsilon=RMSP_EPSILON)
    self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
    self.summary_writer = None
    self.local_step = 0
def __init__(self, args):
  super(Runner, self).__init__()
  self.env_name = args.env_name
  self.log_path = os.path.join(args.log_path, args.env_name)
  self.max_time_step = args.max_time_step
  self.initial_alpha_low = args.initial_alpha_low
  self.initial_alpha_high = args.initial_alpha_high
  self.initial_alpha_log_rate = args.initial_alpha_log_rate
  self.use_pixel_change = args.use_pixel_change
  self.use_value_replay = args.use_value_replay
  self.use_reward_prediction = args.use_reward_prediction
  self.pixel_change_lambda = args.pixel_change_lambda
  self.entropy_beta = args.entropy_beta
  self.alpha = args.alpha
  self.epsilon = args.epsilon
  self.parallel_size = args.parallel_size
  self.local_t_max = args.local_t_max
  self.gamma = args.gamma
  self.gamma_pc = args.gamma_pc
  self.experience_history_size = args.experience_history_size
  self.save_interval_step = args.save_interval_step
  self.grad_norm_clip = args.grad_norm_clip

  initial_learning_rate = log_uniform(self.initial_alpha_low,
                                      self.initial_alpha_high,
                                      self.initial_alpha_log_rate)
  self.global_t = 0
  self.stop_requested = False
  self.terminate_reqested = False

  action_size = Environment.get_action_size(self.env_name)
  self.global_network = Agent(-1, action_size,
                              self.use_pixel_change,
                              self.use_value_replay,
                              self.use_reward_prediction,
                              self.pixel_change_lambda,
                              self.entropy_beta,
                              args.device)
  self.trainers = []
  self.global_network.share_memory()
  optimizor = SharedAdam(params=self.global_network.parameters(),
                         lr=initial_learning_rate,
                         weight_decay=self.alpha,
                         eps=self.epsilon)
  for i in range(self.parallel_size):
    trainer = Trainer(i, self.global_network, initial_learning_rate,
                      self.env_name, self.use_pixel_change,
                      self.use_value_replay, self.use_reward_prediction,
                      self.pixel_change_lambda, self.entropy_beta,
                      self.local_t_max, self.gamma, self.gamma_pc,
                      self.experience_history_size, self.max_time_step,
                      self.grad_norm_clip, optimizor, args.device)
    self.trainers.append(trainer)

  self.summary_writer = SummaryWriter(self.log_path)
  self.saver = Saver(self.log_path)
  self.global_t, wall_t = self.saver.restore(self.global_network)
  # set global step
  print(">>> global step set: ", self.global_t)
  # round the next checkpoint up to the next save interval
  self.next_save_steps = self.save_interval_step if self.global_t == 0 \
      else (self.global_t + self.save_interval_step) // self.save_interval_step * self.save_interval_step

  self.train_threads = []
  for i in range(self.parallel_size):
    self.train_threads.append(
        threading.Thread(target=self.train_function, args=(i, True)))

  self.start_time = time.time() - wall_t
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_lstm, use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, n_step_TD,
             gamma, gamma_pc, experience_history_size, max_global_time_step,
             device, segnet_param_dict, image_shape, is_training, n_classes,
             random_state, termination_time, segnet_lambda, dropout):
  self.thread_index = thread_index
  self.learning_rate_input = learning_rate_input
  self.env_type = env_type
  self.env_name = env_name
  self.use_lstm = use_lstm
  self.use_pixel_change = use_pixel_change
  self.use_value_replay = use_value_replay
  self.use_reward_prediction = use_reward_prediction
  self.local_t_max = local_t_max
  self.n_step_TD = n_step_TD
  self.gamma = gamma
  self.gamma_pc = gamma_pc
  self.experience_history_size = experience_history_size
  self.max_global_time_step = max_global_time_step
  self.action_size = Environment.get_action_size(env_type, env_name)
  self.objective_size = Environment.get_objective_size(env_type, env_name)

  self.segnet_param_dict = segnet_param_dict
  self.segnet_mode = self.segnet_param_dict.get("segnet_mode", None)
  self.is_training = is_training
  self.n_classes = n_classes
  self.segnet_lambda = segnet_lambda

  self.run_metadata = tf.RunMetadata()
  self.many_runs_timeline = TimeLiner()
  self.random_state = random_state
  self.termination_time = termination_time
  self.dropout = dropout

  try:
    self.local_network = UnrealModel(self.action_size, self.objective_size,
                                     thread_index, use_lstm,
                                     use_pixel_change,
                                     use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda,
                                     entropy_beta,
                                     device,
                                     segnet_param_dict=self.segnet_param_dict,
                                     image_shape=image_shape,
                                     is_training=is_training,
                                     n_classes=n_classes,
                                     segnet_lambda=self.segnet_lambda,
                                     dropout=dropout)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(self.local_network.total_loss,
                                                       global_network.get_vars(),
                                                       self.local_network.get_vars(),
                                                       self.thread_index)
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size,
                                 random_state=self.random_state)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = -1
    self.prev_local_t_loss = 0
    self.sr_size = 50
    self.success_rates = deque(maxlen=self.sr_size)
  except Exception as e:
    print(str(e))  #, flush=True)
    raise Exception("Problem in Trainer {} initialization".format(thread_index))
    if step % RECONSTRUCTION_CHECK_INTERVAL == 0:
      # Create reconstruction image
      vae_projection.check_reconstruction(sess, environment, 10,
                                          RECONSTRUCTION_IMAGE_DIR)


def train_episodic_control(agent):
  # TODO:
  for i in range(1):
    ret = agent.step()
    if ret != None:
      print(ret)


num_actions = Environment.get_action_size()
environment = Environment.create_environment()

vae_projection = VAEProjection()
qec_table = QECTable(vae_projection, state_dim, num_actions, k, knn_capacity)
agent = EpisodicControlAgent(environment, qec_table, num_actions, gamma, epsilon)

# Session should be started after Lab environment is created. (To run Lab with GPU)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

vae_projection.set_session(sess)
def run(self):
  device = "/cpu:0"
  if USE_GPU:
    device = "/gpu:0"

  initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                      flags.initial_alpha_high,
                                      flags.initial_alpha_log_rate)
  self.global_t = 0
  self.stop_requested = False
  self.terminate_reqested = False

  action_size = Environment.get_action_size(flags.env_type, flags.env_name)
  self.global_network = UnrealModel(action_size, -1,
                                    flags.use_pixel_change,
                                    flags.use_value_replay,
                                    flags.use_reward_prediction,
                                    flags.pixel_change_lambda,
                                    flags.entropy_beta,
                                    device)
  self.trainers = []
  learning_rate_input = tf.placeholder("float")
  grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                decay=flags.rmsp_alpha,
                                momentum=0.0,
                                epsilon=flags.rmsp_epsilon,
                                clip_norm=flags.grad_norm_clip,
                                device=device)
  for i in range(flags.parallel_size):
    trainer = Trainer(i, self.global_network, initial_learning_rate,
                      learning_rate_input, grad_applier,
                      flags.env_type, flags.env_name,
                      flags.use_pixel_change, flags.use_value_replay,
                      flags.use_reward_prediction, flags.pixel_change_lambda,
                      flags.entropy_beta, flags.local_t_max,
                      flags.gamma, flags.gamma_pc,
                      flags.experience_history_size, flags.max_time_step,
                      device)
    self.trainers.append(trainer)

  flags.checkpoint_dir = sys.path[0] + flags.checkpoint_dir
  flags.log_file = sys.path[0] + flags.log_file

  # prepare session
  config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  self.sess = tf.Session(config=config)
  COORD = tf.train.Coordinator()
  print('before init')
  self.sess.run(tf.global_variables_initializer())
  print('after init')

  # summary for tensorboard
  self.score_input = tf.placeholder(tf.int32)
  self.vr_loss_input = tf.placeholder(tf.float32)
  self.rp_loss_input = tf.placeholder(tf.float32)
  self.summary_op_score = tf.summary.scalar("score", self.score_input)
  self.merge_vr = tf.summary.scalar("value_replay_loss", self.vr_loss_input)
  merge_rp = tf.summary.scalar("reward_prediction_loss", self.rp_loss_input)
  self.summary_op_loss = tf.summary.merge([self.merge_vr, merge_rp])
  # self.summary_op = tf.summary.merge_all()
  # self.summary_op_score = tf.summary.merge([self.score_input])
  # self.summary_op_loss = tf.summary.merge([self.vr_loss_input])
  self.summary_writer = tf.summary.FileWriter(flags.log_file, self.sess.graph)

  # init or load checkpoint with saver
  self.saver = tf.train.Saver(self.global_network.get_vars())
  checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
  if checkpoint and checkpoint.model_checkpoint_path:
    self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    tokens = checkpoint.model_checkpoint_path.split("-")
    # set global step
    self.global_t = int(tokens[1])
    print(">>> global step set: ", self.global_t)
    # set wall time
    wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
    with open(wall_t_fname, 'r') as f:
      self.wall_t = float(f.read())
      self.next_save_steps = (self.global_t + flags.save_interval_step) \
          // flags.save_interval_step * flags.save_interval_step
  else:
    print("Could not find old checkpoint")
    # set wall time
    self.wall_t = 0.0
    self.next_save_steps = flags.save_interval_step

  # run training threads
  # set start time
  self.start_time = time.time() - self.wall_t

  self.train_threads = []
  for i in range(flags.parallel_size):
    t = threading.Thread(target=self.train_function, args=(i, True))
    t.start()
    self.train_threads.append(t)

  signal.signal(signal.SIGINT, self.signal_handler)
  COORD.join(self.train_threads)
  # for t in self.train_threads:
  #   t.start()

  print('Press Ctrl+C to stop')
  signal.pause()
def log_uniform(lo, hi, rate):
  # sample a value log-uniformly between lo and hi
  log_lo = math.log(lo)
  log_hi = math.log(hi)
  v = log_lo * (1 - rate) + log_hi * rate
  return math.exp(v)


device = "/cpu:0"
if USE_GPU:
  device = "/gpu:0"

initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
                                    INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)

global_t = 0

stop_requested = False
terminate_reqested = False

action_size = Environment.get_action_size()
global_network = UnrealModel(action_size, -1, device)

trainers = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

for i in range(PARALLEL_SIZE):
  trainer = Trainer(i,
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, vf_coeff, local_t_max,
             gamma, gamma_pc, experience_history_size, max_global_time_step,
             device):
  self.thread_index = thread_index
  self.learning_rate_input = learning_rate_input
  self.env_type = env_type
  self.env_name = env_name
  self.use_pixel_change = use_pixel_change
  self.use_value_replay = use_value_replay
  self.use_reward_prediction = use_reward_prediction
  self.local_t_max = local_t_max
  self.gamma = gamma
  self.gamma_pc = gamma_pc
  self.experience_history_size = experience_history_size
  self.max_global_time_step = max_global_time_step
  self.action_size = Environment.get_action_size(env_type, env_name)
  self.local_network = UnrealModel(self.action_size, thread_index,
                                   use_pixel_change,
                                   use_value_replay,
                                   use_reward_prediction,
                                   pixel_change_lambda,
                                   entropy_beta,
                                   vf_coeff,
                                   device)
  #self.local_network.prepare_loss()

  # adding things for acktr
  self.local_network.prepare_loss_acktr()
  self.optim = optim = KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                     momentum=0.9, kfac_update=1, epsilon=0.01,
                                     stats_decay=0.99, async=1, cold_iter=10,
                                     max_grad_norm=max_grad_norm)
  update_stats_op = optim.compute_and_apply_stats(self.local_network.fisher_loss,
                                                  var_list=self.local_network.params)
  train_op, q_runner = optim.apply_gradients(
      list(zip(self.local_network.grads, self.local_network.params)))

  # update the rest according to normal stuff
  self.apply_gradients = grad_applier.minimize_local(self.local_network.intermediate_loss,
                                                     global_network.get_vars(),
                                                     self.local_network.get_vars())

  self.sync = self.local_network.sync_from(global_network)
  self.experience = Experience(self.experience_history_size)
  self.local_t = 0
  self.initial_learning_rate = initial_learning_rate
  self.episode_reward = 0
  # For log output
  self.prev_local_t = 0