Example 1
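The BaseTrainer constructor: it builds a thread-local UnrealModel, prepares its loss, wires up the op that applies locally computed gradients to the shared global network, and initializes the counters and trackers used to fill the experience replay buffer.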
    def __init__(self,
                 runner,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 env_type,
                 env_name,
                 entropy_beta,
                 gamma,
                 experience,
                 max_global_time_step,
                 device,
                 value_lambda):
        self.runner = runner
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.gamma = gamma
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.obs_size = Environment.get_obs_size(env_type, env_name)
        self.global_network = global_network
        self.local_network = UnrealModel(self.action_size,
                                         self.obs_size,
                                         1,  # thread index: 1 is the base trainer
                                         entropy_beta,
                                         device,
                                         value_lambda=value_lambda)

        self.local_network.prepare_loss()
        
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss,
            self.global_network.get_vars(),
            self.local_network.get_vars())
        self.sync = self.local_network.sync_from(self.global_network, name="base_trainer")
        self.experience = experience
        self.local_t = 0
        self.next_log_t = 0
        self.next_performance_t = PERFORMANCE_LOG_INTERVAL
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # trackers for the experience replay creation
        self.last_state = None
        self.last_action = 0
        self.last_reward = 0
        self.ep_ploss = 0.
        self.ep_vloss = 0.
        self.ep_entr = []
        self.ep_grad = []
        self.ep_l = 0
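The internals of grad_applier.minimize_local() are not part of this excerpt. As a rough guide, here is a minimal sketch of the A3C-style update such a method typically builds; the function name and the default clip value (standing in for flags.grad_norm_clip) are hypothetical:

import tensorflow as tf

def minimize_local_sketch(optimizer, total_loss, global_vars, local_vars,
                          clip_norm=40.0):
    # Gradients are taken w.r.t. the thread-local weights...
    grads = tf.gradients(total_loss, local_vars)
    grads, _ = tf.clip_by_global_norm(grads, clip_norm)
    # ...but applied to the shared global parameters (A3C-style update).
    return optimizer.apply_gradients(zip(grads, global_vars))

Under that assumption, the returned op would correspond to what is stored in self.apply_gradients above and run once per rollout.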
Example 2
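The application's run() method: it builds the global UnrealModel and the gradient applier, creates the environment, the runner thread, and the shared experience buffer, constructs the BaseTrainer and one AuxTrainer per parallel worker, restores a checkpoint if one exists, and finally launches the training threads.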
    def run(self):
        device = "/cpu:0"
        if USE_GPU:
            device = "/gpu:0"
        logger.debug("start App")
        initial_learning_rate = flags.initial_learning_rate

        self.global_t = 0
        self.aux_t = 0
        self.stop_requested = False
        self.terminate_requested = False
        logger.debug("getting action size and observation size...")
        action_size = Environment.get_action_size(flags.env_type,
                                                  flags.env_name)
        obs_size = Environment.get_obs_size(flags.env_type, flags.env_name)
        # Setup Global Network
        logger.debug("loading global model...")
        self.global_network = UnrealModel(
            action_size,
            obs_size,
            -1,  # thread index: -1 denotes the global network
            flags.entropy_beta,
            device,
            use_pixel_change=flags.use_pixel_change,
            use_value_replay=flags.use_value_replay,
            use_reward_prediction=flags.use_reward_prediction,
            use_temporal_coherence=flags.use_temporal_coherence,
            use_proportionality=flags.use_proportionality,
            use_causality=flags.use_causality,
            use_repeatability=flags.use_repeatability,
            value_lambda=flags.value_lambda,
            pixel_change_lambda=flags.pixel_change_lambda,
            temporal_coherence_lambda=flags.temporal_coherence_lambda,
            proportionality_lambda=flags.proportionality_lambda,
            causality_lambda=flags.causality_lambda,
            repeatability_lambda=flags.repeatability_lambda)
        logger.debug("done loading global model")
        learning_rate_input = tf.placeholder("float")

        # Setup gradient calculator
        #"""
        grad_applier = RMSPropApplier(
            learning_rate=learning_rate_input,
            #decay = flags.rmsp_alpha,
            momentum=0.0,
            #epsilon = flags.rmsp_epsilon,
            clip_norm=flags.grad_norm_clip,
            device=device)
        """
        grad_applier = AdamApplier(learning_rate = learning_rate_input,
                                   clip_norm=flags.grad_norm_clip,
                                   device=device)
        """
        # Start environment
        self.environment = Environment.create_environment(
            flags.env_type, flags.env_name)
        logger.debug("done loading environment")

        # Setup runner
        self.runner = RunnerThread(flags, self.environment,
                                   self.global_network, action_size, obs_size,
                                   device, visualise)
        logger.debug("done setting up RunnerTread")

        # Setup experience
        self.experience = Experience(flags.experience_history_size)

        # TODO: check device usage: should we build a cluster?
        # Setup Base Network
        self.base_trainer = BaseTrainer(
            self.runner, self.global_network, initial_learning_rate,
            learning_rate_input, grad_applier, flags.env_type, flags.env_name,
            flags.entropy_beta, flags.gamma, self.experience,
            flags.max_time_step, device, flags.value_lambda)

        # Setup Aux Networks
        self.aux_trainers = []
        for k in range(flags.parallel_size):
            self.aux_trainers.append(
                AuxTrainer(
                    self.global_network,
                    k + 2,  # thread index: -1 is global, 0 is the runner thread, 1 is base
                    flags.use_base,
                    flags.use_pixel_change,
                    flags.use_value_replay,
                    flags.use_reward_prediction,
                    flags.use_temporal_coherence,
                    flags.use_proportionality,
                    flags.use_causality,
                    flags.use_repeatability,
                    flags.value_lambda,
                    flags.pixel_change_lambda,
                    flags.temporal_coherence_lambda,
                    flags.proportionality_lambda,
                    flags.causality_lambda,
                    flags.repeatability_lambda,
                    flags.aux_initial_learning_rate,
                    learning_rate_input,
                    grad_applier,
                    self.aux_t,
                    flags.env_type,
                    flags.env_name,
                    flags.entropy_beta,
                    flags.local_t_max,
                    flags.gamma,
                    flags.aux_lambda,
                    flags.gamma_pc,
                    self.experience,
                    flags.max_time_step,
                    device))

        # Start tensorflow session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        self.init_tensorboard()

        # init or load checkpoint with saver
        self.saver = tf.train.Saver(self.global_network.get_vars())

        checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
        if CONTINUE_TRAINING and checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            # normalize path separators for Windows-style log output
            checkpointpath = checkpoint.model_checkpoint_path.replace(
                "/", "\\")
            logger.info("checkpoint loaded: {}".format(checkpointpath))
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            logger.info(">>> global step set: {}".format(self.global_t))
            logger.info(">>> aux step: {}".format(self.aux_t))
            # set wall time
            wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(
                self.global_t)
            with open(wall_t_fname, 'r') as f:
                self.wall_t = float(f.read())
                self.next_save_steps = (
                    self.global_t + flags.save_interval_step
                ) // flags.save_interval_step * flags.save_interval_step
                logger.debug("next save steps:{}".format(self.next_save_steps))
        else:
            logger.info("Could not find old checkpoint")
            # set wall time
            self.wall_t = 0.0
            self.next_save_steps = flags.save_interval_step

        signal.signal(signal.SIGINT, self.signal_handler)

        # set start time
        self.start_time = time.time() - self.wall_t
        # Start runner
        self.runner.start_runner(self.sess)
        # Start base_network thread
        self.base_train_thread = threading.Thread(
            target=self.base_train_function, args=())
        self.base_train_thread.start()

        # Start aux_network threads
        self.aux_train_threads = []
        for k in range(flags.parallel_size):
            self.aux_train_threads.append(
                threading.Thread(target=self.aux_train_function, args=(k, )))
            self.aux_train_threads[k].start()

        logger.debug(threading.enumerate())

        logger.info('Press Ctrl+C to stop')
        signal.pause()
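In the checkpoint-resume branch above, next_save_steps is rounded to the first multiple of save_interval_step strictly above the restored step, so periodic saves stay on a fixed grid. A quick worked example with hypothetical numbers:

interval = 100000  # hypothetical value for flags.save_interval_step
global_t = 234567  # step recovered from the checkpoint filename
next_save_steps = (global_t + interval) // interval * interval
print(next_save_steps)  # 300000: the next save point on the interval grid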
Example 3
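The AuxTrainer constructor: it mirrors BaseTrainer but enables the auxiliary tasks, where each use_* flag switches on both the corresponding head of the local UnrealModel and the matching loss term collected into aux_losses.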
    def __init__(self, global_network, thread_index, use_base,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 use_temporal_coherence, use_proportionality, use_causality,
                 use_repeatability, value_lambda, pixel_change_lambda,
                 temporal_coherence_lambda, proportionality_lambda,
                 causality_lambda, repeatability_lambda, initial_learning_rate,
                 learning_rate_input, grad_applier, aux_t, env_type, env_name,
                 entropy_beta, local_t_max, gamma, aux_lambda, gamma_pc,
                 experience, max_global_time_step, device):

        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.use_temporal_coherence = use_temporal_coherence
        self.use_proportionality = use_proportionality
        self.use_causality = use_causality
        self.use_repeatability = use_repeatability
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.entropy_beta = entropy_beta
        self.local_t = 0
        self.next_sync_t = 0
        self.next_log_t = 0
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.aux_lambda = aux_lambda
        self.gamma_pc = gamma_pc
        self.experience = experience
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)
        self.obs_size = Environment.get_obs_size(env_type, env_name)
        self.thread_index = thread_index
        self.local_network = UnrealModel(
            self.action_size,
            self.obs_size,
            self.thread_index,
            self.entropy_beta,
            device,
            use_pixel_change=use_pixel_change,
            use_value_replay=use_value_replay,
            use_reward_prediction=use_reward_prediction,
            use_temporal_coherence=use_temporal_coherence,
            use_proportionality=use_proportionality,
            use_causality=use_causality,
            use_repeatability=use_repeatability,
            value_lambda=value_lambda,
            pixel_change_lambda=pixel_change_lambda,
            temporal_coherence_lambda=temporal_coherence_lambda,
            proportionality_lambda=proportionality_lambda,
            causality_lambda=causality_lambda,
            repeatability_lambda=repeatability_lambda,
            for_display=False,
            use_base=use_base)

        self.local_network.prepare_loss()
        self.global_network = global_network

        #logger.debug("ln.total_loss:{}".format(self.local_network.total_loss))

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, self.global_network.get_vars(),
            self.local_network.get_vars())
        self.sync = self.local_network.sync_from(self.global_network,
                                                 name="aux_trainer_{}".format(
                                                     self.thread_index))
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # trackers for the experience replay creation
        self.last_action = np.zeros(self.action_size)
        self.last_reward = 0

        self.aux_losses = []
        self.aux_losses.append(self.local_network.policy_loss)
        self.aux_losses.append(self.local_network.value_loss)
        if self.use_pixel_change:
            self.aux_losses.append(self.local_network.pc_loss)
        if self.use_value_replay:
            self.aux_losses.append(self.local_network.vr_loss)
        if self.use_reward_prediction:
            self.aux_losses.append(self.local_network.rp_loss)
        if self.use_temporal_coherence:
            self.aux_losses.append(self.local_network.tc_loss)
        if self.use_proportionality:
            self.aux_losses.append(self.local_network.prop_loss)
        if self.use_causality:
            self.aux_losses.append(self.local_network.caus_loss)
        if self.use_repeatability:
            self.aux_losses.append(self.local_network.rep_loss)
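Note that sync_from() is likewise not shown in these excerpts. In TF1 worker code it is typically a group of assign ops that copy the shared global weights into the thread-local network. A minimal sketch, assuming get_vars() returns both variable lists in matching order (sync_from_sketch is a hypothetical name):

import tensorflow as tf

def sync_from_sketch(global_vars, local_vars, name):
    # One assign per weight tensor; grouping them lets a single
    # sess.run() refresh the whole local copy before each rollout.
    copy_ops = [dst.assign(src) for src, dst in zip(global_vars, local_vars)]
    return tf.group(*copy_ops, name=name)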