def __init__(self, runner, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             entropy_beta, gamma, experience, max_global_time_step, device,
             value_lambda):
    self.runner = runner
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.gamma = gamma
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.obs_size = Environment.get_obs_size(env_type, env_name)

    self.global_network = global_network
    self.local_network = UnrealModel(self.action_size,
                                     self.obs_size,
                                     1,
                                     entropy_beta,
                                     device,
                                     value_lambda=value_lambda)
    self.local_network.prepare_loss()

    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss, self.global_network.get_vars(),
        self.local_network.get_vars())
    self.sync = self.local_network.sync_from(self.global_network,
                                             name="base_trainer")

    self.experience = experience
    self.local_t = 0
    self.next_log_t = 0
    self.next_performance_t = PERFORMANCE_LOG_INTERVAL
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # trackers for the experience replay creation
    self.last_state = None
    self.last_action = 0
    self.last_reward = 0

    # per-episode loss, entropy, and gradient statistics
    self.ep_ploss = 0.
    self.ep_vloss = 0.
    self.ep_entr = []
    self.ep_grad = []
    self.ep_l = 0
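# A minimal sketch, not part of the original class, of the linear
# learning-rate annealing that initial_learning_rate and
# max_global_time_step are typically combined for in A3C-style trainers.
# The name _anneal_learning_rate and the exact schedule are assumptions:
def _anneal_learning_rate(self, global_time_step):
    # Decay linearly from initial_learning_rate at step 0 to zero at
    # max_global_time_step, clamping at zero afterwards.
    learning_rate = self.initial_learning_rate * (
        self.max_global_time_step - global_time_step) / self.max_global_time_step
    return max(learning_rate, 0.0)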
def run(self):
    device = "/cpu:0"
    if USE_GPU:
        device = "/gpu:0"
    logger.debug("start App")

    initial_learning_rate = flags.initial_learning_rate
    self.global_t = 0
    self.aux_t = 0
    self.stop_requested = False
    self.terminate_requested = False

    logger.debug("getting action size and observation size...")
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    obs_size = Environment.get_obs_size(flags.env_type, flags.env_name)

    # Setup Global Network
    logger.debug("loading global model...")
    self.global_network = UnrealModel(
        action_size,
        obs_size,
        -1,
        flags.entropy_beta,
        device,
        use_pixel_change=flags.use_pixel_change,
        use_value_replay=flags.use_value_replay,
        use_reward_prediction=flags.use_reward_prediction,
        use_temporal_coherence=flags.use_temporal_coherence,
        use_proportionality=flags.use_proportionality,
        use_causality=flags.use_causality,
        use_repeatability=flags.use_repeatability,
        value_lambda=flags.value_lambda,
        pixel_change_lambda=flags.pixel_change_lambda,
        temporal_coherence_lambda=flags.temporal_coherence_lambda,
        proportionality_lambda=flags.proportionality_lambda,
        causality_lambda=flags.causality_lambda,
        repeatability_lambda=flags.repeatability_lambda)
    logger.debug("done loading global model")

    learning_rate_input = tf.placeholder("float")

    # Setup gradient calculator (RMSProp is active; uncomment the
    # AdamApplier block below to switch optimizers)
    grad_applier = RMSPropApplier(
        learning_rate=learning_rate_input,
        # decay=flags.rmsp_alpha,
        momentum=0.0,
        # epsilon=flags.rmsp_epsilon,
        clip_norm=flags.grad_norm_clip,
        device=device)
    # grad_applier = AdamApplier(learning_rate=learning_rate_input,
    #                            clip_norm=flags.grad_norm_clip,
    #                            device=device)

    # Start environment
    self.environment = Environment.create_environment(flags.env_type,
                                                      flags.env_name)
    logger.debug("done loading environment")

    # Setup runner
    self.runner = RunnerThread(flags, self.environment, self.global_network,
                               action_size, obs_size, device, visualise)
    logger.debug("done setting up RunnerThread")

    # Setup experience
    self.experience = Experience(flags.experience_history_size)
    # @TODO check device usage: should we build a cluster?
    # Setup Base Network
    self.base_trainer = BaseTrainer(self.runner, self.global_network,
                                    initial_learning_rate,
                                    learning_rate_input, grad_applier,
                                    flags.env_type, flags.env_name,
                                    flags.entropy_beta, flags.gamma,
                                    self.experience, flags.max_time_step,
                                    device, flags.value_lambda)

    # Setup Aux Networks
    self.aux_trainers = []
    for k in range(flags.parallel_size):
        self.aux_trainers.append(
            AuxTrainer(
                self.global_network,
                k + 2,  # -1 is global, 0 is runnerthread, 1 is base
                flags.use_base,
                flags.use_pixel_change,
                flags.use_value_replay,
                flags.use_reward_prediction,
                flags.use_temporal_coherence,
                flags.use_proportionality,
                flags.use_causality,
                flags.use_repeatability,
                flags.value_lambda,
                flags.pixel_change_lambda,
                flags.temporal_coherence_lambda,
                flags.proportionality_lambda,
                flags.causality_lambda,
                flags.repeatability_lambda,
                flags.aux_initial_learning_rate,
                learning_rate_input,
                grad_applier,
                self.aux_t,
                flags.env_type,
                flags.env_name,
                flags.entropy_beta,
                flags.local_t_max,
                flags.gamma,
                flags.aux_lambda,
                flags.gamma_pc,
                self.experience,
                flags.max_time_step,
                device))

    # Start tensorflow session
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())

    self.init_tensorboard()

    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_vars())
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        checkpointpath = checkpoint.model_checkpoint_path.replace("/", "\\")
        logger.info("checkpoint loaded: {}".format(checkpointpath))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step from the checkpoint filename suffix
        self.global_t = int(tokens[1])
        logger.info(">>> global step set: {}".format(self.global_t))
        logger.info(">>> aux step: {}".format(self.aux_t))
        # set wall time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
        self.next_save_steps = (
            self.global_t + flags.save_interval_step
        ) // flags.save_interval_step * flags.save_interval_step
        logger.debug("next save steps: {}".format(self.next_save_steps))
    else:
        logger.info("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0
        self.next_save_steps = flags.save_interval_step

    signal.signal(signal.SIGINT, self.signal_handler)

    # set start time
    self.start_time = time.time() - self.wall_t

    # Start runner
    self.runner.start_runner(self.sess)

    # Start base_network thread
    self.base_train_thread = threading.Thread(
        target=self.base_train_function, args=())
    self.base_train_thread.start()

    # Start aux_network threads
    self.aux_train_threads = []
    for k in range(flags.parallel_size):
        self.aux_train_threads.append(
            threading.Thread(target=self.aux_train_function, args=(k, )))
        self.aux_train_threads[k].start()

    logger.debug(threading.enumerate())

    logger.info('Press Ctrl+C to stop')
    signal.pause()
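# A minimal sketch of the base_train_function thread target used above; the
# actual implementation lives elsewhere in the repository, and the
# process() signature shown here is an assumption:
def base_train_function(self):
    trainer = self.base_trainer
    # Loop until a stop is requested or the global step budget is spent.
    while not (self.stop_requested or self.terminate_requested):
        if self.global_t > flags.max_time_step:
            break
        # process() is assumed to run one training unroll and return the
        # number of global steps it advanced.
        diff_global_t = trainer.process(self.sess, self.global_t)
        self.global_t += diff_global_t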
def __init__(self, global_network, thread_index, use_base, use_pixel_change,
             use_value_replay, use_reward_prediction, use_temporal_coherence,
             use_proportionality, use_causality, use_repeatability,
             value_lambda, pixel_change_lambda, temporal_coherence_lambda,
             proportionality_lambda, causality_lambda, repeatability_lambda,
             initial_learning_rate, learning_rate_input, grad_applier, aux_t,
             env_type, env_name, entropy_beta, local_t_max, gamma, aux_lambda,
             gamma_pc, experience, max_global_time_step, device):
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.use_temporal_coherence = use_temporal_coherence
    self.use_proportionality = use_proportionality
    self.use_causality = use_causality
    self.use_repeatability = use_repeatability
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.entropy_beta = entropy_beta
    self.local_t = 0
    self.next_sync_t = 0
    self.next_log_t = 0
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.aux_lambda = aux_lambda
    self.gamma_pc = gamma_pc
    self.experience = experience
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.obs_size = Environment.get_obs_size(env_type, env_name)
    self.thread_index = thread_index

    self.local_network = UnrealModel(
        self.action_size,
        self.obs_size,
        self.thread_index,
        self.entropy_beta,
        device,
        use_pixel_change=use_pixel_change,
        use_value_replay=use_value_replay,
        use_reward_prediction=use_reward_prediction,
        use_temporal_coherence=use_temporal_coherence,
        use_proportionality=use_proportionality,
        use_causality=use_causality,
        use_repeatability=use_repeatability,
        value_lambda=value_lambda,
        pixel_change_lambda=pixel_change_lambda,
        temporal_coherence_lambda=temporal_coherence_lambda,
        proportionality_lambda=proportionality_lambda,
        causality_lambda=causality_lambda,
        repeatability_lambda=repeatability_lambda,
        for_display=False,
        use_base=use_base)
    self.local_network.prepare_loss()

    self.global_network = global_network
    # logger.debug("ln.total_loss:{}".format(self.local_network.total_loss))
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss, self.global_network.get_vars(),
        self.local_network.get_vars())
    self.sync = self.local_network.sync_from(
        self.global_network,
        name="aux_trainer_{}".format(self.thread_index))

    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # trackers for the experience replay creation
    self.last_action = np.zeros(self.action_size)
    self.last_reward = 0

    # collect the losses that are active for this trainer
    self.aux_losses = [self.local_network.policy_loss,
                       self.local_network.value_loss]
    if self.use_pixel_change:
        self.aux_losses.append(self.local_network.pc_loss)
    if self.use_value_replay:
        self.aux_losses.append(self.local_network.vr_loss)
    if self.use_reward_prediction:
        self.aux_losses.append(self.local_network.rp_loss)
    if self.use_temporal_coherence:
        self.aux_losses.append(self.local_network.tc_loss)
    if self.use_proportionality:
        self.aux_losses.append(self.local_network.prop_loss)
    if self.use_causality:
        self.aux_losses.append(self.local_network.caus_loss)
    if self.use_repeatability:
        self.aux_losses.append(self.local_network.rep_loss)
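# A minimal sketch, assuming a feed_dict prepared elsewhere, of how the
# aux_losses list collected above can be evaluated in a single session call
# for logging (_log_aux_losses is a hypothetical helper, not part of the
# original class):
def _log_aux_losses(self, sess, feed_dict):
    # One sess.run over the whole list avoids recomputing the shared
    # forward pass once per loss.
    loss_values = sess.run(self.aux_losses, feed_dict=feed_dict)
    for tensor, value in zip(self.aux_losses, loss_values):
        logger.debug("aux_trainer_{}: {} = {}".format(
            self.thread_index, tensor.name, value))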