Example #1
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs,))
            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)

                _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
                                                                 self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, self.num_timesteps)

                self.num_timesteps += self.n_batch

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

        return self
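A minimal way to drive this A2C.learn() loop, assuming stable-baselines 2.x (the TF1 library this code appears to match) and classic Gym are installed; the environment name, timestep budget, and the 50 000-step cutoff in the callback are illustrative, not from the source.

# Hedged usage sketch: an old-style callback stops training by returning False,
# matching the `callback(locals(), globals()) is False` check inside learn().
import gym
from stable_baselines import A2C

def stop_after_50k(locals_, globals_):
    # Keep training while num_timesteps is below an illustrative budget
    return locals_["self"].num_timesteps < 50_000

model = A2C("MlpPolicy", gym.make("CartPole-v1"), verbose=1)
model.learn(total_timesteps=100_000, callback=stop_after_50k, log_interval=100)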
Example #2
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="CLAC",
              reset_num_timesteps=True,
              randomization=0):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            learning_results = pd.DataFrame()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            reward_data = pd.DataFrame()

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if self.num_timesteps < self.learning_starts:
                    if (isinstance(self.env.action_space, Discrete)):
                        action = []
                        for _ in range(self.env.action_space.n):
                            action.append(1 / self.env.action_space.n)
                        rescaled_action = self.env.action_space.sample()
                    else:
                        action = self.env.action_space.sample()
                        # No need to rescale when sampling random action
                        rescaled_action = action
                else:
                    if (isinstance(self.env.action_space, Discrete)):
                        actions = list(range(self.env.action_space.n))
                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()
                        rescaled_action = np.random.choice(actions,
                                                           1,
                                                           p=action)[0]
                    else:
                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()
                        # Rescale from [-1, 1] to the correct bounds
                        rescaled_action = action * np.abs(
                            self.action_space.low)

                if (not isinstance(self.env.action_space, Discrete)):
                    assert action.shape == self.env.action_space.shape

                # If coinrunner environment
                # rescaled_action = np.array(rescaled_action, ndmin=1)

                new_obs, reward, done, info = self.env.step(rescaled_action)

                act_mu, act_std = self.policy_tf.proba_step(obs[None])

                if (len(act_std) == 1):
                    act_std = act_std[0]

                #print("ACT MU FROM PROBA STEP", act_mu)
                #print("ACT STD FROM PROBA STEP", act_std)
                if self.num_timesteps > self.learning_starts:
                    # Only update marginal approximation after learning starts is completed
                    if (self.multivariate_mean is None):
                        self.multivariate_mean = act_mu
                    else:
                        previous_mean = self.multivariate_mean
                        self.multivariate_mean = (
                            (1 - self.learning_rate_phi) *
                            self.multivariate_mean) + (self.learning_rate_phi *
                                                       act_mu)
                    if (self.multivariate_cov is None):
                        self.multivariate_cov = np.diag(act_std)
                    else:
                        cov = (self.learning_rate_phi * np.diag(act_std) +
                               (1 - self.learning_rate_phi) *
                               self.multivariate_cov)
                        mom_1 = (self.learning_rate_phi *
                                 np.square(np.diag(act_mu))) + (
                                     (1 - self.learning_rate_phi) *
                                     np.square(np.diag(previous_mean)))
                        mom_2 = np.square((self.learning_rate_phi *
                                           np.diag(act_mu)) +
                                          (1 - self.learning_rate_phi) *
                                          np.diag(previous_mean))
                        self.multivariate_cov = cov + mom_1 - mom_2

                    # Update Beta parameter if coef_schedule is set
                    if (self.coef_schedule is not None
                            and self.mut_inf_coef > 1e-12):
                        # (1 - a) B + a(1/L()) # Loss-based update schedule, for later

                        # Currently using linear schedule:
                        self.mut_inf_coef *= (1 - self.coef_schedule)
                    """if(self.num_timesteps % 1000 == 0):
                        print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", self.num_timesteps)"""

                # Store transition in the replay buffer.
                #print("adding action to replay buffer: ", action)
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                # info = info[0]
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        for mb_info_val in mb_infos_vals:
                            for mb_info in mb_info_val:
                                if mb_info is not None:
                                    infos_values.append(np.mean(mb_info))
                        #infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()

                        if randomization == 1:
                            try:
                                for env in self.env.unwrapped.envs:
                                    env.randomize()
                            except AttributeError as exc:
                                raise RuntimeError(
                                    "Trying to randomize an environment that is not set up "
                                    "for randomization; check the environment file") from exc

                        if randomization == 2:
                            try:
                                for env in self.env.unwrapped.envs:
                                    env.randomize_extreme()
                            except AttributeError as exc:
                                raise RuntimeError(
                                    "Trying to extremely randomize an environment that is not "
                                    "set up for randomization; check the environment file") from exc

                    Model_String = "CLAC"
                    if not self.auto_mut_inf_coef:
                        Model_String = "CLAC " + str(self.mut_inf_coef)

                    env_name = self.env.unwrapped.envs[0].spec.id

                    mut_inf_coef = self.init_mut_inf_coef
                    if (type(self.mut_inf_coef) == tf.Tensor
                            or np.isnan(mut_inf_coef)):
                        mut_inf_coef = "auto"
                    Model_String = "CLAC" + str(mut_inf_coef)
                    d = {
                        'Episode Reward': episode_rewards[-1],
                        'Coefficient': mut_inf_coef,
                        'Timestep': self.num_timesteps,
                        'Episode Number': len(episode_rewards) - 1,
                        'Env': env_name,
                        'Randomization': randomization,
                        'Model': "CLAC"
                    }
                    # Record this episode's results (DataFrame.append is deprecated in recent pandas)
                    learning_results = pd.concat([learning_results, pd.DataFrame([d])],
                                                 ignore_index=True)

                    self.tf_logged_reward = episode_rewards[-1]

                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return (self, learning_results)
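The CLAC-specific part of this loop is the exponential-moving-average update of the approximate action marginal (multivariate_mean / multivariate_cov). A standalone numpy sketch of just that update, with illustrative dimensions and a made-up learning_rate_phi standing in for self.learning_rate_phi:

import numpy as np

def update_marginal(mean, cov, act_mu, act_std, learning_rate_phi=0.01):
    # One EMA step for the approximate action marginal, mirroring the
    # multivariate_mean / multivariate_cov bookkeeping in learn() above.
    if mean is None:
        return act_mu, np.diag(act_std)
    new_mean = (1 - learning_rate_phi) * mean + learning_rate_phi * act_mu
    blended_cov = learning_rate_phi * np.diag(act_std) + (1 - learning_rate_phi) * cov
    # First/second-moment correction for the shifted mean, as in the code above
    mom_1 = (learning_rate_phi * np.square(np.diag(act_mu))
             + (1 - learning_rate_phi) * np.square(np.diag(mean)))
    mom_2 = np.square(learning_rate_phi * np.diag(act_mu)
                      + (1 - learning_rate_phi) * np.diag(mean))
    return new_mean, blended_cov + mom_1 - mom_2

mean, cov = None, None
for _ in range(3):
    act_mu = np.random.randn(2)                 # stand-ins for policy_tf.proba_step output
    act_std = np.abs(np.random.randn(2)) + 0.1
    mean, cov = update_marginal(mean, cov, act_mu, act_std)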
Example #3
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            tra_obs = []
            ep_count = 0
            selected_goal = None
            tra_count = 0
            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape
                new_obs, reward, done, info = self.env.step(rescaled_action)

                #################################################################
                # fit density model and update goal proposing model
                skew_explore_obs = obs.copy()
                if isinstance(self.env, HERGoalEnvWrapper):
                    skew_explore_obs_dict = self.env.convert_obs_to_dict(
                        skew_explore_obs)
                    skew_explore_obs = np.array(
                        [skew_explore_obs_dict['observation']])
                    tra_obs.append(skew_explore_obs[0])
                    if selected_goal is None:
                        selected_goal = np.array(
                            skew_explore_obs_dict['desired_goal'])
                else:
                    tra_obs.append(skew_explore_obs)

                self.skew_explore.update_history(skew_explore_obs, [done])
                if (step % self.goal_update_frequency == 0
                        and step != 0) or step == 2000:
                    logging.info('update buffer')
                    self.skew_explore.activate_buffer()
                #################################################################

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    self.plot_tra(tra_count, tra_obs, selected_goal)
                    tra_obs = []
                    selected_goal = None

                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()

                    ep_count += 1
                    episode_rewards.append(0.0)
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                    tra_count += 1
                    self.save(self.args.save_path + '/model')

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
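The action-selection pattern at the top of this loop (shared by the other off-policy examples) is: sample uniformly before learning_starts or with probability random_exploration, otherwise take the squashed policy action in [-1, 1], add clipped noise, and rescale by |action_space.low|. A self-contained sketch under the classic Gym step/reset API, with a random stand-in for policy_tf.step; the environment name and noise scale are illustrative.

import numpy as np
import gym

env = gym.make("Pendulum-v0")   # illustrative; any symmetric Box action space works
num_timesteps, learning_starts, random_exploration = 0, 100, 0.0

def policy_step(obs):
    # Stand-in for self.policy_tf.step(obs[None], deterministic=False).flatten()
    return np.tanh(np.random.randn(*env.action_space.shape))

obs = env.reset()
for step in range(200):
    if num_timesteps < learning_starts or np.random.rand() < random_exploration:
        # No need to rescale when sampling a random action
        rescaled_action = action = env.action_space.sample()
    else:
        action = policy_step(obs)
        action = np.clip(action + 0.1 * np.random.randn(*action.shape), -1, 1)  # exploration noise
        rescaled_action = action * np.abs(env.action_space.low)  # [-1, 1] -> env bounds
    obs, reward, done, info = env.step(rescaled_action)
    num_timesteps += 1
    if done:
        obs = env.reset()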
Example #4
    def learn(self, total_timesteps, callback=None, seed=None,
              log_interval=4, tb_log_name="SAC", reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            obs = self.env.reset()
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            obs = self.env.reset()
            for i in range(128):
                action = self.env.action_space.sample()
                new_obs, reward, done, info = self.env.step(action)
                # print(new_obs)
                # self.env.render()
                self.iiayn.update_history([obs])
                obs = new_obs

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if self.num_timesteps < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action #* np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                print(step, action)
                # self.env.render()

                self.iiayn.update_history([obs])

                if step % 2048 == 0:
                    self.iiayn.activate_buffer()

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs, float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done or step % 1024 == 0:
                    obs = self.env.reset()
                    # if not isinstance(self.env, VecEnv):
                    #     obs = self.env.reset()
                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
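All of these loops track returns the same way: episode_rewards[-1] accumulates the running episode, a fresh 0.0 is appended when the episode ends, and the logged "mean 100 episode reward" skips the still-open last entry via the [-101:-1] slice. A tiny standalone sketch with made-up rewards:

import numpy as np

episode_rewards = [0.0]
transitions = [(1.0, False), (2.0, True), (0.5, False), (1.5, True), (3.0, False)]  # (reward, done)
for reward, done in transitions:
    episode_rewards[-1] += reward
    if done:
        episode_rewards.append(0.0)
    completed = episode_rewards[-101:-1]   # completed episodes only, at most the last 100
    mean_reward = -np.inf if len(completed) == 0 else round(float(np.mean(completed)), 1)
print(episode_rewards, mean_reward)        # [3.0, 2.0, 3.0] 2.5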
Example #5
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=2000,
              tb_log_name="MDPO",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose):

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)
            frac = 0
            t_k = 0

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(obs[None],
                                                 deterministic=True).flatten()
                    #action = self.policy_tf.step(obs[None], deterministic=True).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                #print("action", action)
                new_obs, reward, done, info = self.env.step(rescaled_action)
                #print("new obs", new_obs)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done), info)
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(1):  #int(self.gradient_steps)
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        t_k = self.klconst  # step / total_timesteps
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, current_lr, t_k))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                if step % self.gradient_steps == 0:
                    self.sess.run(self.assign_policy_op)

                episode_rewards[-1] += reward
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and step % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("lamda", self.lamda)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.logkv("t_k", t_k)
                    logger.logkv("steps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
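The learning-rate handling shared by these loops is: get_schedule_fn turns a constant or a callable into a function of the fraction of training remaining, and each gradient step evaluates it at frac = 1 - step / total_timesteps. A minimal re-implementation for illustration only (not the stable-baselines source), with an illustrative linear schedule:

def get_schedule_fn(value_schedule):
    # Minimal stand-in for stable-baselines' get_schedule_fn:
    # constants become a flat schedule, callables are used as-is.
    if callable(value_schedule):
        return value_schedule
    return lambda frac: float(value_schedule)

learning_rate = get_schedule_fn(lambda frac: 3e-4 * frac)   # linear decay, illustrative
total_timesteps = 1000
for step in (0, 500, 999):
    frac = 1.0 - step / total_timesteps
    print(step, learning_rate(frac))   # 3e-4, 1.5e-4, 3e-7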
Example #6
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):

        self.pretrained_weight = self.load_weight()
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()

            self.set_ewc_model(runner)

            restores = []
            for param, loaded_p in zip(self.params, self.pretrained_weight):
                restores.append(param.assign(loaded_p))
            self.sess.run(restores)

            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            flag_ewc = False
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0

                if (update > 8.e5 // self.n_batch):
                    flag_ewc = True

                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount

                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep,
                                                 ewc=flag_ewc))

                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states,
                                                 ewc=flag_ewc))
                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac
                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
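The non-recurrent branch above shuffles flat sample indices each epoch and slices the rollout arrays into nminibatches minibatches. A self-contained numpy sketch of just that indexing scheme, with stand-in rollout arrays and illustrative sizes:

import numpy as np

n_batch, nminibatches, noptepochs = 2048, 4, 4
batch_size = n_batch // nminibatches
obs = np.random.randn(n_batch, 8)      # stand-ins for the rollout arrays
returns = np.random.randn(n_batch)

inds = np.arange(n_batch)
for epoch_num in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, n_batch, batch_size):
        mbinds = inds[start:start + batch_size]
        mb_obs, mb_returns = obs[mbinds], returns[mbinds]
        # self._train_step(lr_now, cliprangenow, mb_obs, mb_returns, ..., ewc=flag_ewc) would run here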
Example #7
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope(
                        "kfac_apply",
                        reuse=self.trained,
                        custom_getter=tf_util.outer_scope_getter(
                            "kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized) if not f
                    ]

                    self.train_op, self.q_runner = self.optim.apply_gradients(
                        list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized)
                        if not f and v not in old_uninitialized_vars
                    ]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(
                            tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            # Use GAE
            if self.gae_lambda is not None:
                runner = PPO2Runner(env=self.env,
                                    model=self,
                                    n_steps=self.n_steps,
                                    gamma=self.gamma,
                                    lam=self.gae_lambda)
            else:
                runner = A2CRunner(self.env,
                                   self,
                                   n_steps=self.n_steps,
                                   gamma=self.gamma)

            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess,
                                                               coord=coord,
                                                               start=True)
            else:
                enqueue_threads = []

            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                if isinstance(runner, PPO2Runner):
                    # We are using GAE
                    obs, returns, masks, actions, values, _, states, ep_infos, true_reward = runner.run()
                else:
                    obs, states, returns, masks, actions, values, ep_infos, true_reward = runner.run()

                ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(
                    obs, states, returns, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

                self.num_timesteps += self.n_batch + 1

            coord.request_stop()
            coord.join(enqueue_threads)

        return self
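The kfac_apply block above uses a TF1 idiom: snapshot which global variables are uninitialized, build the new ops, then initialize only the variables that appeared afterwards. A standalone sketch of that pattern, assuming TensorFlow 1.x; the dummy variable stands in for optim.apply_gradients:

import tensorflow as tf   # TensorFlow 1.x assumed, as in the code above

sess = tf.Session()
existing = tf.Variable(1.0, name="already_there")
sess.run(tf.variables_initializer([existing]))

# Snapshot: which variables are currently uninitialized?
tf_vars = tf.global_variables()
is_uninitialized = sess.run([tf.is_variable_initialized(var) for var in tf_vars])
old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f]

# Build ops that create new variables (stands in for optim.apply_gradients)
new_var = tf.Variable(0.0, name="kfac_apply_stand_in")

# Initialize only the variables created since the snapshot
tf_vars = tf.global_variables()
is_uninitialized = sess.run([tf.is_variable_initialized(var) for var in tf_vars])
new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized)
                          if not f and v not in old_uninitialized_vars]
if new_uninitialized_vars:
    sess.run(tf.variables_initializer(new_uninitialized_vars))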
Example #8
    def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) "
                                                               "is not a factor of the total number of samples "
                                                               "collected per rollout (`n_batch`), "
                                                               "some samples won't be used."
                                                               )
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = self.runner.run()
                self.num_timesteps += self.n_batch
                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # non-recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num *
                                                                            self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                                 update=timestep, cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num *
                                                                            self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                                 writer=writer, states=mb_states,
                                                                 cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                true_reward.reshape((self.n_envs, self.n_steps)),
                                                masks.reshape((self.n_envs, self.n_steps)),
                                                writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("learning_rate", self.curr_lr)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
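
The linear annealing above comes from feeding `frac = 1.0 - (update - 1.0) / n_updates` into callables produced by `get_schedule_fn`, so `lr_now` and `cliprange_now` decay from their initial values towards zero over training. A minimal, self-contained sketch of that convention (the `linear_schedule` helper and the numbers are illustrative, and `get_schedule_fn` here is only a stand-in with the same calling convention):

def get_schedule_fn(value_schedule):
    # Wrap a constant into a callable so `schedule(frac)` always works.
    if isinstance(value_schedule, (int, float)):
        constant = float(value_schedule)
        return lambda _frac: constant
    return value_schedule

def linear_schedule(initial_value):
    # `frac` starts at 1.0 on the first update and approaches 0.0 on the last one.
    return lambda frac: frac * initial_value

learning_rate = get_schedule_fn(linear_schedule(2.5e-4))
cliprange = get_schedule_fn(0.2)        # a constant stays constant

n_updates = 10
for update in range(1, n_updates + 1):
    frac = 1.0 - (update - 1.0) / n_updates
    lr_now = learning_rate(frac)        # 2.5e-4, 2.25e-4, ... down to 2.5e-5
    cliprange_now = cliprange(frac)     # always 0.2
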
    def learn(self, total_timesteps, callback=None, seed=None,
              log_interval=1, tb_log_name="SAC", print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
                info = {"cte": 0.0}
            else:
                obs = self.env.reset()
                info = {"cte": 0.0}

            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            # ---------------------load the trained NN for safety signal
            tf_obs = tf.placeholder(tf.float32, obs.shape)
            hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
            hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu)
            output = tf.layers.dense(hidden2, 2)
 
            sess = tf.Session()
            saver = tf.train.Saver()
            saver.restore(sess, "./saved_params/param02-level1-linear/safe_layer")
            # --------------------------------------------------------

            fr = open("dump_reward.txt", "w")
            fv = open("dump_violation.txt", "w")
            fl = open("dump_lambda.txt", "w")
            cum_reward = []
            num_vio = 0

            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                # ---------- use trained NN to revise the action
                #print("-------------------------------")
                print("h1, rescaled_action ", rescaled_action)
                proposed_action = rescaled_action.copy()
                proposed_action = np.asarray(proposed_action).reshape((1, 2))
                #print ("h2, proposed_action", proposed_action)
                print("obs shape: ", obs.shape)
                corr_v = sess.run(output, {tf_obs: obs})
                lambda_v = (info["cte"] + np.dot(corr_v, proposed_action.T) - 1.3) / np.dot(corr_v, corr_v.T)
                #print (info["cte"], info["cte"] + np.asscalar(np.dot(corr_v, proposed_action.T)) )
                print("lambda: ", lambda_v)    
                if lambda_v < 0:
                    lambda_v = 0.0
                proposed_action -= lambda_v * corr_v
                proposed_action *= np.abs(self.action_space.low)
                #print ("h3 proposed_action: ", proposed_action)
                #print("h4 rescaled_action: ", rescaled_action)
                rescaled_action[0] = proposed_action[0][0]
                rescaled_action[1] = proposed_action[0][1]
                # -----------------------------------------
                print("h5 rescaled_action: ", rescaled_action)

                new_obs, reward, done, new_info = self.env.step(rescaled_action)
                ep_len += 1
                if (len(cum_reward) == 10):
                    cum_reward.pop(0)
                cum_reward.append(reward)
                curr = 0.0
                for i in range(len(cum_reward)):
                    idx = len(cum_reward) - i - 1
                    curr += cum_reward[idx] * (0.99**i)
                fr.write("%f \n" %(curr))
                fv.write("%d \n" %(num_vio))
                fl.write("%f \n" %(lambda_v))

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, rescaled_action, reward, new_obs, float(done))
                obs = new_obs
                info = new_info

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    num_vio += 1
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitoring training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
                

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        fr.close()
        fv.close()
        fl.close()
        return self
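
The action revision in this example is the closed-form safety-layer projection: with `corr_v` the network's estimate of how the action moves the cross-track error, the action is shifted along `corr_v` by `lambda = max(0, (cte + corr_v·a - 1.3) / (corr_v·corr_v))`, the smallest correction that brings the predicted constraint back to the 1.3 bound. A standalone sketch of that projection (the function name and the example numbers are illustrative, not from the snippet):

import numpy as np

def safe_action(proposed_action, corr_v, cte, bound=1.3):
    # Project `proposed_action` onto the half-space {a : cte + corr_v . a <= bound}.
    g = np.asarray(corr_v, dtype=np.float64).reshape(1, -1)
    a = np.asarray(proposed_action, dtype=np.float64).reshape(1, -1)
    lambda_v = (cte + (g @ a.T).item() - bound) / (g @ g.T).item()
    lambda_v = max(lambda_v, 0.0)   # only correct when the constraint would be violated
    return a - lambda_v * g, lambda_v

corrected, lam = safe_action([0.4, 0.8], corr_v=[0.9, 0.1], cte=1.1)
# here lam > 0 and cte + corr_v . corrected lands exactly on the 1.3 bound
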
Example #10
0
    def learn_jirl(self,
                   total_timesteps,
                   joystick=None,
                   callback=None,
                   seed=None,
                   log_interval=1,
                   tb_log_name="SAC",
                   print_freq=100,
                   base_policy=None,
                   stochastic_actor=True,
                   expert_guidance_steps=50000,
                   save_path=None):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:
            # Add path to model in this function
            self._setup_learn(seed)

            # Joystick object
            js = JoyStick()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            episode_rewards = [0.0]

            # Reset the environment
            obs = self.env.reset()

            # Book keeping
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            n_crashes = 0
            infos_values = []
            mb_infos_vals = []
            pred_action_info = deque(maxlen=20)
            mean_info = deque(maxlen=50)
            std_info = deque(maxlen=50)
            throttle_info = deque(maxlen=1000)

            is_action_expert = False
            is_action_actor = True

            was_last_action_actor = False

            last_action_actor = None
            last_obs = None
            # steps in which expert takes control
            expert_control_steps = []
            state = {}  # for the imitation learning agent

            MAX_LEN = 10

            is_ratios_target_expert = deque(
                maxlen=MAX_LEN)  # IS ratios over the last few steps
            is_ratios_target_actor = deque(maxlen=MAX_LEN)

            EPS = 1e-10

            # Stats for plotting
            rew_per_step = []
            rew_per_step_rl = []
            rl_control = []

            # Buffer to control the threshold dynamically
            thresh_buffer = deque(maxlen=1000)
            std_buffer = deque(maxlen=10000)
            mean_buffer = deque(maxlen=10000)

            import time
            start_time = time.time()
            try:
                for step in range(total_timesteps):
                    # Compute current learning_rate
                    frac = 1.0 - step / total_timesteps
                    current_lr = self.learning_rate(frac)

                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break

                    # Get prediction from base policy
                    steerCmd = float(base_policy.predict(obs)[0][0])
                    #                 print("Steering from IL: ", steerCmd)
                    throttleCmd = -1
                    action_expert = [steerCmd, throttleCmd]
                    # mean_exp, std_exp = il_model.get_proba_actions(state)
                    # print(scipy.stats.multivariate_normal(mean = mean, cov = std).pdf(action_expert))

                    # Test with hard coded variance
                    # std_exp = [0.1, 0.1]
                    # proba_expert_policy = scipy.stats.norm(mean_exp[0], std_exp[0]).pdf(action_expert[0])
                    # proba_expert_policy = scipy.stats.norm(mean_exp[0], std_exp[0]).cdf(action_expert[0] + EPS) - scipy.stats.norm(mean_exp[0], std_exp[0]).cdf(action_expert[0] - EPS)
                    # if 2*np.pi*np.prod(std) <= 1:
                    #     proba_expert_policy = 2*np.pi*np.prod(std)*scipy.stats.multivariate_normal(mean = mean, cov = std).pdf(action_expert)
                    # else:
                    #     proba_expert_policy = scipy.stats.multivariate_normal(mean = mean, cov = std).pdf(action_expert)

                    ## ====== Test code snippet ======
                    # action_expert, _ = model.predict(obs, deterministic=True)
                    # new_obs, reward, done, info = self.env.step(action_expert)
                    ## ===============================

                    if not stochastic_actor:
                        action_actor = self.policy_tf.step(
                            obs[None], deterministic=True).flatten()
                    else:
                        action_actor = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()

                    if step >= expert_guidance_steps:
                        action_actor = self.policy_tf.step(
                            obs[None], deterministic=True).flatten()

                    mean_act, std_act = self.policy_tf.proba_step(obs[None])
                    # print(scipy.stats.multivariate_normal(mean = mean.flatten(), cov = std.flatten()).pdf(action_actor))

                    proba_actor_policy = scipy.stats.norm(
                        mean_act.flatten()[0],
                        std_act.flatten()[0]).pdf(action_actor[0])

                    proba_expert_policy = scipy.stats.norm(
                        mean_act.flatten()[0],
                        std_act.flatten()[0]).pdf(action_expert[0])
                    # proba_actor_policy = scipy.stats.norm(mean_act.flatten()[0], std_act.flatten()[0]).cdf(action_actor[0] + EPS) - scipy.stats.norm(mean_act.flatten()[0], std_act.flatten()[0]).cdf(action_actor[0] - EPS)
                    # if 2*np.pi*np.prod(std) <= 1:
                    #     proba_actor_policy = 2*np.pi*np.prod(std.flatten())*scipy.stats.multivariate_normal(mean = mean.flatten(), cov = std.flatten()).pdf(action_actor)
                    # else:
                    #     proba_actor_policy = scipy.stats.multivariate_normal(mean = mean.flatten(), cov = std.flatten()).pdf(action_actor)
                    # Update entropy buffer
                    std_buffer.append(std_act)
                    # Update mean difference buffer
                    mean_buffer.append(np.linalg.norm(mean_act -
                                                      action_expert))
                    # mean_buffer.append(np.linalg.norm(action_actor - action_expert))
                    rho = round(float(step) / expert_guidance_steps, 2)
                    # THRESH = (1 - rho) * (scipy.stats.norm(0, 0.1).pdf(0) - 1.0)**MAX_LEN
                    # _THRESH = (1 - rho) * (scipy.stats.norm(0, 0.1).pdf(0) - 2.0)

                    _THRESH = (np.mean(std_buffer) +
                               np.mean(mean_buffer)) * (1 - rho)
                    THRESH = _THRESH**MAX_LEN

                    if step >= expert_guidance_steps:
                        # Only let the RL control the car
                        # If this doesn't work, tune MAX_LEN
                        THRESH = _THRESH = 0

                    if js.is_on():
                        ## =====================================
                        ## MANUAL CONTROL
                        ## =====================================
                        # Execute commands from the joystick in the environment
                        action_js = [js.get_steer(), -1]
                        new_obs, reward, done, info = self.env.step(action_js)

                        # Store transition in the replay buffer.
                        self.replay_buffer.add(obs, action_js, reward, new_obs,
                                               float(done))

                        ## ==========================================
                        sigma_p = 0.01
                        reward_hat = reward * np.exp(
                            -np.linalg.norm(action_actor - action_js) /
                            sigma_p)
                        self.replay_buffer.add(obs, action_actor, reward_hat,
                                               new_obs, float(done))
                        ## ==========================================

                        if was_last_action_actor:
                            # Train the actor when the expert's actions are executed
                            # mb_infos_vals = self.optimize(step, writer, current_lr)
                            penalty = -1  #-10
                            self.replay_buffer.add(last_obs, last_action_actor,
                                                   penalty, obs, float(done))
                            is_ratios_target_expert = deque(maxlen=MAX_LEN)
                            was_last_action_actor = False
                            last_action_actor = None
                            last_obs = None

                        is_action_actor = False

                        # print("Actor IS ratio: ", is_ratio)

                        # if ep_len > 700:
                        #     print("Expert: ", np.prod(is_ratios_target_actor))
                        if (len(is_ratios_target_actor) == MAX_LEN) and np.all(
                            [(p > _THRESH) for p in is_ratios_target_actor]):
                            # Switch control to actor in the next step
                            is_action_actor = True

                        rew_per_step_rl.append(0.0)
                        rl_control.append(0)

                    # else:
                    elif is_action_actor:
                        ## =====================================
                        ## RL CONTROL
                        ## =====================================
                        # Execute actor's actions in the environment
                        new_obs, reward, done, info = self.env.step(
                            action_actor)

                        # Update IS ratio
                        is_ratio = self.importance_sampling_ratio(
                            1.0, proba_expert_policy)
                        is_ratios_target_expert.append(is_ratio)

                        # Store transition in the replay buffer.
                        self.replay_buffer.add(obs, action_actor, reward,
                                               new_obs, float(done))

                        if not was_last_action_actor:
                            is_ratios_target_actor = deque(maxlen=MAX_LEN)

                        is_action_actor = True

                        # print("Actor: ", np.prod(is_ratios_target_expert))
                        # Per step safety check
                        if is_ratio < _THRESH:
                            # Switch control to the expert
                            is_action_actor = False

                        # Safety check for a sequence of states
                        if (len(is_ratios_target_actor) == MAX_LEN) and np.all(
                            [(p > _THRESH) for p in is_ratios_target_actor]):
                            #if (len(is_ratios_target_expert) == MAX_LEN) and (np.prod(is_ratios_target_expert) <= THRESH):
                            # Switch control to expert in the next step
                            is_action_actor = False

                        was_last_action_actor = True
                        last_action_actor = action_actor
                        last_obs = obs

                        rew_per_step_rl.append(reward)
                        rl_control.append(1)

                    else:
                        ## =======================================
                        ## EXPERT CONTROL
                        ## =======================================
                        # Execute expert action in the environment
                        new_obs, reward, done, info = self.env.step(
                            action_expert)
                        # Update IS ratio
                        # is_ratio = self.importance_sampling_ratio(1.0, proba_actor_policy)
                        is_ratio = self.importance_sampling_ratio(
                            1.0, proba_expert_policy)
                        is_ratios_target_actor.append(is_ratio)

                        # print("Expert ", is_ratio)

                        # Store transition in the replay buffer.
                        self.replay_buffer.add(obs, action_expert, reward,
                                               new_obs, float(done))

                        ## ==========================================
                        # # NOTE: Figure out what's going wrong here
                        # # Without the penalized reward the policy diverges (mean doesn't go towards 0
                        # # Also test with stochastic actions from the RL policy
                        # # # Add penalized reward to actor's action
                        # # r_hat: penalized reward
                        sigma_p = 0.01
                        reward_hat = reward * np.exp(
                            -np.linalg.norm(action_actor - action_expert) /
                            sigma_p)
                        self.replay_buffer.add(obs, action_actor, reward_hat,
                                               new_obs, float(done))
                        ## ==========================================

                        if was_last_action_actor:
                            # Train the actor when the expert's actions are executed
                            # mb_infos_vals = self.optimize(step, writer, current_lr)
                            penalty = -1  #-10
                            self.replay_buffer.add(last_obs, last_action_actor,
                                                   penalty, obs, float(done))
                            is_ratios_target_expert = deque(maxlen=MAX_LEN)
                            was_last_action_actor = False
                            last_action_actor = None
                            last_obs = None

                        is_action_actor = False

                        # print("Actor IS ratio: ", is_ratio)

                        # if ep_len > 700:
                        #     print("Expert: ", np.prod(is_ratios_target_actor))

                        #                         if (len(is_ratios_target_actor) == MAX_LEN) and (np.prod(is_ratios_target_actor) > THRESH):
                        if (len(is_ratios_target_actor) == MAX_LEN) and np.all(
                            [(p > _THRESH) for p in is_ratios_target_actor]):
                            # Switch control to actor in the next step
                            is_action_actor = True

                        rew_per_step_rl.append(0.0)
                        rl_control.append(0)

                    throttle_info.append(float(self.env.last_throttle))
                    rew_per_step.append(reward)

                    pred_action_info.append(
                        np.abs(action_actor[0] - action_expert[0]))
                    # mean_info.append([mean_exp[0], mean_act.flatten()[0]])
                    # std_info.append([std_exp[0], std_act.flatten()[0]])

                    ep_len += 1
                    obs = new_obs

                    if ep_len % 400 == 0:
                        print("Mean error pred actions: {}".format(
                            np.mean(pred_action_info)))
                        print("Mean difference: {}".format(
                            np.mean(mean_buffer)))
                        print("Mean std: {}".format(np.mean(std_buffer)))
                        # print("Mean: ", [np.mean([x[0] for x in mean_info]), np.mean([x[1] for x in mean_info])])
                        # print("Std: ", [np.mean([x[0] for x in std_info]), np.mean([x[1] for x in std_info])])
                        # print(np.prod(is_ratios_target_actor))

                    # Train every 400 steps --- under consideration
                    if (ep_len % 400) == 0:
                        self.env.jet.apply_throttle(0)
                        mb_infos_vals = self.optimize(step, writer, current_lr)

                    # if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    #     print("{} steps".format(ep_len))

                    # Retrieve reward and episode length if using Monitor wrapper
                    maybe_ep_info = info.get('episode')
                    if maybe_ep_info is not None:
                        ep_info_buf.extend([maybe_ep_info])

                    if writer is not None:
                        # Write reward per episode to tensorboard
                        ep_reward = np.array([reward]).reshape((1, -1))
                        ep_done = np.array([done]).reshape((1, -1))
                        self.episode_reward = total_episode_reward_logger(
                            self.episode_reward, ep_reward, ep_done, writer,
                            step)

                    episode_rewards[-1] += reward

                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                    if len(episode_rewards[-101:-1]) == 0:
                        mean_reward = -np.inf
                    else:
                        mean_reward = round(
                            float(np.mean(episode_rewards[-101:-1])), 1)

                    if len(rl_control) < 1000:
                        mean_rl_control = round(
                            100 * float(np.mean(rl_control)), 3)
                    else:
                        mean_rl_control = round(
                            100 * float(np.mean(rl_control[-1001:-1])), 3)

                    num_episodes = len(episode_rewards)

                    if self.verbose >= 1 and (ep_len % 400) == 0:
                        logger.logkv("episodes", num_episodes)
                        logger.logkv("mean 100 episode reward", mean_reward)
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                        logger.logkv("n_updates", self.n_updates)
                        logger.logkv("current_lr", current_lr)
                        logger.logkv("mean RL control percent",
                                     mean_rl_control)
                        logger.logkv("mean of throttle values",
                                     mean(throttle_info))
                        logger.logkv("time elapsed",
                                     int(time.time() - start_time))
                        #logger.logkv("n_crashes", n_crashes)
                        if len(infos_values) > 0:
                            for (name, val) in zip(self.infos_names,
                                                   infos_values):
                                logger.logkv(name, val)
                        logger.logkv("total timesteps", step)
                        logger.dumpkvs()
                        # Reset infos:
                        infos_values = []
            except KeyboardInterrupt:
                print("Exiting")
                self.env.reset()
                import sys
                sys.exit(0)

            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)

            # save stats
            np.save(save_path + '/episode_reward', episode_rewards)
            np.save(save_path + '/stepwise_reward', rew_per_step)
            np.save(save_path + '/stepwise_reward_rl', rew_per_step_rl)
            print("Saving complete. Give a keyboard interrupt to end")
        return self
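
Stripped of bookkeeping, the control-switching rule in `learn_jirl` is: evaluate the executed steering command under the actor's current Gaussian, keep the last MAX_LEN of those densities in a deque, and let the actor drive only while all of them clear a threshold that shrinks as `step` approaches `expert_guidance_steps`. A compact sketch of that rule (the buffers and policy statistics here are placeholders, not the snippet's objects):

from collections import deque
import numpy as np
import scipy.stats

MAX_LEN = 10
is_ratios = deque(maxlen=MAX_LEN)

def density_under_actor(action_component, mean_act, std_act):
    # Likelihood of a steering command under the actor's current Gaussian policy.
    return scipy.stats.norm(mean_act, std_act).pdf(action_component)

def adaptive_threshold(std_buffer, mean_buffer, step, expert_guidance_steps):
    # Shrinks towards 0 as rho -> 1, so the actor is trusted more as training progresses.
    rho = min(float(step) / expert_guidance_steps, 1.0)
    return (np.mean(std_buffer) + np.mean(mean_buffer)) * (1.0 - rho)

def actor_should_drive(is_ratios, thresh):
    # Hand control to the RL actor only when the last MAX_LEN densities all clear the bar.
    return len(is_ratios) == MAX_LEN and all(p > thresh for p in is_ratios)
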
Example #11
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # runner = DistributedRunner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)

            ctx = multiprocessing.get_context('spawn')
            q = ctx.Queue()
            p = ctx.Process(
                target=runDRunner, kwargs={'examples_queue': q}
            )  #, 'env' : self.env, 'model' : self, 'n_steps' : self.n_steps, 'gamma' : self.gamma, 'lam':self.lam})
            p.start()
            print("STarted up queue from master")

            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            print("about to run...", n_updates, "updates and batch size",
                  self.n_batch)
            for update in range(1, n_updates + 1):
                print("In loop.")
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount

                # pull from queue
                print("Pulling from quee...")
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = q.get(
                    block=True)
                print("Got something!")

                self.num_timesteps += self.n_batch
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []

                #non-recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + (
                            (self.noptepochs * self.n_batch +
                             epoch_num * self.n_batch + start) // batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mb_loss_vals.append(
                            self._train_step(lr_now,
                                             cliprange_now,
                                             *slices,
                                             writer=writer,
                                             update=timestep,
                                             cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                ## BROADCAST WEIGHTS

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
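
The distributed variant above is a plain producer/consumer pair: a separate process (started with the `spawn` context) pushes rollouts into a multiprocessing queue, and the training loop blocks on `q.get()` until a batch arrives. A stripped-down sketch of that wiring (the worker body is a stand-in for `runDRunner`, which is not shown in the snippet):

import multiprocessing

def rollout_worker(examples_queue):
    # Stand-in for runDRunner: push a few fake rollouts and exit.
    for update in range(3):
        examples_queue.put({"update": update, "obs": [0.0] * 4})

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    q = ctx.Queue()
    p = ctx.Process(target=rollout_worker, kwargs={"examples_queue": q})
    p.start()
    for _ in range(3):
        rollout = q.get(block=True)   # blocks until the worker has produced a batch
        print("got rollout for update", rollout["update"])
    p.join()
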
Example #12
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope(
                        "kfac_apply",
                        reuse=self.trained,
                        custom_getter=tf_util.outer_scope_getter(
                            "kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized) if not f
                    ]

                    self.train_op, self.q_runner = self.optim.apply_gradients(
                        list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized)
                        if not f and v not in old_uninitialized_vars
                    ]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(
                            tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess,
                                                               coord=coord,
                                                               start=True)
            else:
                enqueue_threads = []

            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()

                # pytype:disable=bad-unpacking
                # true_reward is the reward without discount
                if isinstance(self.runner, PPO2Runner):
                    # We are using GAE
                    rollout = self.runner.run(callback)
                    obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout
                else:
                    rollout = self.runner.run(callback)
                    obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout
                # pytype:enable=bad-unpacking

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(
                    obs, states, returns, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.dump_tabular()

            coord.request_stop()
            coord.join(enqueue_threads)

        callback.on_training_end()
        return self
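
The `kfac_apply` block uses a TF1 idiom worth isolating: record which global variables are uninitialized, build the new ops (which creates optimizer slot variables), then initialize only the variables that appeared in between, leaving already-trained weights untouched. A minimal sketch of that idiom, assuming a TF1-style graph and session (the helper name is illustrative):

import tensorflow as tf   # assumes TF 1.x, or tf.compat.v1 with eager execution disabled

def initialize_only_new_variables(sess, build_fn):
    # Snapshot the currently uninitialized variables.
    tf_vars = tf.global_variables()
    flags = sess.run([tf.is_variable_initialized(v) for v in tf_vars])
    old_uninitialized = {v for v, f in zip(tf_vars, flags) if not f}

    result = build_fn()   # e.g. optim.apply_gradients(...), which creates new variables

    # Initialize only what build_fn added.
    tf_vars = tf.global_variables()
    flags = sess.run([tf.is_variable_initialized(v) for v in tf_vars])
    new_uninitialized = [v for v, f in zip(tf_vars, flags)
                         if not f and v not in old_uninitialized]
    if new_uninitialized:
        sess.run(tf.variables_initializer(new_uninitialized))
    return result
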
Example #13
0
    def reward_callback(local, _):
        nonlocal eprewmeans
        eprewmeans.append(
            safe_mean([ep_info["r"] for ep_info in local["ep_info_buf"]]))
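
This callback works because stable-baselines 2.x calls `callback(locals(), globals())` from inside `learn()`, so the rolling buffer of Monitor episode infos is reachable through the first argument. A hedged sketch of how such a callback is typically wired up; the environment, the module-level list replacing the snippet's `nonlocal`, the local `safe_mean`, and the fallback to `self.ep_info_buf` (used by newer releases) are all illustrative assumptions:

import numpy as np
from stable_baselines import PPO2

eprewmeans = []   # module-level list instead of the snippet's nonlocal

def safe_mean(arr):
    # Tolerate an empty buffer early in training, before any Monitor episode has finished.
    return float(np.mean(arr)) if len(arr) > 0 else float("nan")

def reward_callback(local, _):
    # Older releases expose ep_info_buf as a local of learn(); newer ones keep it on the model.
    buf = local.get("ep_info_buf", getattr(local["self"], "ep_info_buf", []))
    eprewmeans.append(safe_mean([ep_info["r"] for ep_info in buf]))
    return True   # returning False would stop training early

model = PPO2("MlpPolicy", "CartPole-v1", verbose=0)
model.learn(total_timesteps=10000, callback=reward_callback)
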
Example #14
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="TD3",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        last_replay_update = 0

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
            self.active_sampling = False
            initial_step = self.num_timesteps

            if self.buffer_is_prioritized and \
                    ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer")
                     or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer")) \
                    and self.num_timesteps >= self.prioritization_starts:
                self._set_prioritized_buffer()

            for step in range(initial_step, total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(obs[None]).flatten()
                    # Add noise to the action, as the policy
                    # is deterministic, this is required for exploration
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(
                    obs, action, reward, new_obs,
                    float(done if not self.time_aware else done
                          and info["termination"] != "steps"))
                obs = new_obs

                if ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "RankPrioritizedReplayBuffer")\
                        or self.replay_buffer.__name__ == "RankPrioritizedReplayBuffer") and \
                        self.num_timesteps % self.buffer_size == 0:
                    self.replay_buffer.rebalance()

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - self.num_timesteps / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        # Note: the policy is updated less frequently than the Q functions
                        # this is controlled by the `policy_delay` parameter
                        step_writer = writer if grad_step % self.write_freq == 0 else None
                        mb_infos_vals.append(
                            self._train_step(step, step_writer, current_lr,
                                             (step + grad_step) %
                                             self.policy_delay == 0))

                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if isinstance(self.replay_buffer, DiscrepancyReplayBuffer
                                  ) and n_updates - last_replay_update >= 5000:
                        self.replay_buffer.update_priorities()
                        last_replay_update = n_updates
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        if self.active_sampling:
                            sample_obs, sample_state = self.env.get_random_initial_states(
                                25)
                            obs_discrepancies = self.policy_tf.get_q_discrepancy(
                                sample_obs)
                            obs = self.env.reset(
                                **sample_state[np.argmax(obs_discrepancies)])
                        else:
                            obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1

                if self.buffer_is_prioritized and \
                        ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer")
                         or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer"))\
                        and self.num_timesteps >= self.prioritization_starts:
                    self._set_prioritized_buffer()

                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
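
Two details of the TD3 loop above are easy to miss: exploration noise is added in the normalized [-1, 1] action space and only then rescaled by |action_space.low| (which assumes symmetric bounds), and the actor/target networks are updated only every `policy_delay` gradient steps while the critics update on every step. A small sketch of both (the function names and example numbers are arbitrary):

import numpy as np

def select_action(policy_action, noise, action_low):
    # Clip in the squashed [-1, 1] space, then scale to the environment's bounds.
    action = np.clip(policy_action + noise, -1.0, 1.0)
    return action, action * np.abs(action_low)

def update_policy_now(step, grad_step, policy_delay=2):
    # Critics train every gradient step; the actor and targets only every `policy_delay` steps.
    return (step + grad_step) % policy_delay == 0

action, rescaled = select_action(np.array([0.3, -0.9]),
                                 np.random.normal(0.0, 0.1, size=2),
                                 action_low=np.array([-2.0, -1.0]))
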
Example #15
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="SAC",
              print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
                info = {"cte": 0.0}
            else:
                obs = self.env.reset()
                info = {"cte": 0.0}

            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            # ---------------------load the trained NN for safety signal

            tf_obs = tf.placeholder(tf.float32, shape=(1, 104))

            hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
            hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu)
            output1 = tf.layers.dense(hidden2, 2)

            hidden3 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
            hidden4 = tf.layers.dense(hidden3, 16, tf.nn.relu)
            output2 = tf.layers.dense(hidden4, 3)

            sess = tf.Session()
            saver = tf.train.Saver()
            saver.restore(sess,
                          "./saved_params/param03-level1-quad/safe_layer")

            # --------------------------------------------------------

            fr = open("dump_reward.txt", "w")
            fv = open("dump_violation.txt", "w")
            cum_reward = []
            num_vio = 0

            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                # ---------- use trained NN to revise the action
                if action[1] < 0:
                    action[1] *= -1
                print("h1, action ", action)
                proposed_action = action.copy()
                action_take = action.copy()
                proposed_action = np.asarray(proposed_action).reshape((1, 2))
                #print ("h2, proposed_action", proposed_action)

                #print("obs shape", obs.shape)
                v1 = sess.run(output1, {tf_obs: obs.reshape((1, 104))})

                v2 = sess.run(output2, {tf_obs: obs.reshape((1, 104))})
                q = [v2[0][0], 0.5 * v2[0][1], 0.5 * v2[0][1], v2[0][2]]
                q = np.reshape(q, (2, 2))
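                # v2 provides three values that are rebuilt into the symmetric 2x2
                # quadratic-term matrix q (the off-diagonal entry is shared), so the
                # safety constraint below is quadratic in the proposed action.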

                x = cvx.Variable(1, 2)
                obj = cvx.sum_squares(x - proposed_action)
                cons = [info["cte"] + v1 * x.T + x * q * x.T <= 4.8, x[1] > 0]
                prob = cvx.Problem(cvx.Minimize(obj), cons)

                try:
                    qcqp = QCQP(prob)
                    qcqp.suggest(SDR)
                    f_cd, v_cd = qcqp.improve(COORD_DESCENT)
                    print(
                        "Coordinate descent: objective %.3f, violation %.3f" %
                        (f_cd, v_cd))

                    if v_cd == 0:
                        new_action = x.value
                        new_action = np.asarray(new_action).reshape((1, 2))
                        print("h5, action ", new_action)
                        action_take[0] = new_action[0][0]
                        action_take[1] = new_action[0][1]
                        new_obs, reward, done, new_info = self.env.step(
                            action_take)
                        action = action_take
                    else:
                        new_obs, reward, done, new_info = self.env.step(action)

                except Exception:
                    # Fall back to the unmodified action if the QCQP solve fails
                    new_obs, reward, done, new_info = self.env.step(action)
#             -----------------------------------------

                ep_len += 1

                if (len(cum_reward) == 10):
                    cum_reward.pop(0)
                cum_reward.append(reward)
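                # Discounted sum of the last (up to) 10 rewards, gamma = 0.99, with
                # the most recent reward weighted highest; written to the dump files
                # together with the running violation count.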
                curr = 0.0
                for i in range(len(cum_reward)):
                    idx = len(cum_reward) - i - 1
                    curr += cum_reward[idx] * (0.99**i)
                fr.write("%f \n" % (curr))
                fv.write("%d \n" % (num_vio))

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs
                info = new_info

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    num_vio += 1
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(
                        episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        return self
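The core of the example above is the safety layer: the restored network predicts a linear term (v1) and a quadratic term (q) of a cost model, and the proposed action is replaced by the closest action whose predicted cost, info["cte"] + v1*a + a^T q a, stays below 4.8, with the second action dimension kept non-negative. The snippet below is a minimal sketch of the same projection using scipy.optimize.minimize instead of the qcqp heuristics; the threshold and constraints mirror the example, while the function name and the dummy numbers are purely illustrative.

import numpy as np
from scipy.optimize import minimize


def project_to_safe_set(proposed, v1, q, cte, limit=4.8):
    """Return the action closest to `proposed` whose predicted cost
    cte + v1.a + a^T q a stays below `limit` (illustrative sketch only)."""

    def objective(a):
        return float(np.sum((a - proposed) ** 2))

    constraints = [
        # predicted safety margin must stay non-negative
        {"type": "ineq", "fun": lambda a: limit - (cte + v1 @ a + a @ q @ a)},
        # keep the second action dimension non-negative, as in the example
        {"type": "ineq", "fun": lambda a: a[1]},
    ]
    result = minimize(objective, proposed, constraints=constraints)
    return result.x if result.success else proposed  # fall back to the raw action


# Illustrative call with dummy network outputs
safe_action = project_to_safe_set(
    proposed=np.array([0.3, 0.5]),
    v1=np.array([0.1, -0.2]),
    q=np.array([[0.05, 0.01], [0.01, 0.02]]),
    cte=1.0,
)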
Example #16
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name,
                               new_tb_log) as writer:
            # self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))
            self.total_episode_reward = np.zeros((1, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()
            n_timesteps = 0
            # nupdates = total_timesteps // self.n_batch
            for timestep in range(1, total_timesteps + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - timestep / total_timesteps
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                n_timesteps += len(obs)
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprangenow,
                                    *slices,
                                    writer=writer,
                                    update=n_timesteps,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                            #             envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=n_timesteps,
                                                 writer=writer,
                                                 states=mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                # if writer is not None:
                #     self.episode_reward = total_episode_reward_logger(self.episode_reward,
                #                                                       true_reward.reshape((self.n_envs, self.n_steps)),
                #                                                       masks.reshape((self.n_envs, self.n_steps)),
                #                                                       writer, n_timesteps)

                if self.verbose >= 1 and (timestep % log_interval == 0
                                          or timestep == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("total_timesteps", n_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()
                    self.total_episode_reward = runner.total_episode_reward

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                if n_timesteps > total_timesteps:
                    break

            return self
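Example #16 is the PPO2 update loop: each rollout is split into noptepochs passes over shuffled minibatches, and every array from the rollout is sliced with the same shuffled indices. A minimal sketch of that minibatching pattern (the sizes and the rollout placeholder are illustrative):

import numpy as np

n_batch, nminibatches, noptepochs = 2048, 4, 4
batch_size = n_batch // nminibatches
rollout = np.arange(n_batch)  # stands in for the obs/returns/actions arrays

inds = np.arange(n_batch)
for epoch_num in range(noptepochs):
    np.random.shuffle(inds)                       # new visiting order every epoch
    for start in range(0, n_batch, batch_size):
        mbinds = inds[start:start + batch_size]   # indices of one minibatch
        minibatch = rollout[mbinds]               # every rollout array is sliced the same way
        # ... _train_step(...) would be called on this minibatch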
Example #17
    def reward_callback(local, _):
        nonlocal eprewmeans
        eprewmeans.append(
            safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']]))
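Example #17 shows a functional callback that records the rolling mean episode reward; learn passes its locals() as the first argument, so ep_info_buf can be read from there. A hedged usage sketch with a pre-2.10 style function callback (the PPO2 model and the CartPole-v1 environment id are placeholders):

from stable_baselines import PPO2

eprewmeans = []

def reward_callback(local, _):
    ep_info_buf = local.get('ep_info_buf', [])
    if len(ep_info_buf) > 0:
        eprewmeans.append(sum(ep['r'] for ep in ep_info_buf) / len(ep_info_buf))
    return True  # returning False would stop training early

model = PPO2('MlpPolicy', 'CartPole-v1', verbose=0)
model.learn(total_timesteps=10000, callback=reward_callback)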
Example #18
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="SAC",
              print_freq=100,
              save_path=None):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
            else:
                obs = self.env.reset()

            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []
            model_path = "--Path to model--/myNewModdel.h5"
            # model_path=  None
            if model_path is not None:
                cfg = dk.load_config(
                    config_path='--Path to config file inside mycar/config.py')
                kl = KerasLinear()
                kl.load(model_path)
                # vae = self.env.get_vae()
            self.training_started = False
            self.start_training = False
            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts and not self.training_started:
                    if model_path is not None:
                        try:
                            img_arr = self.env.get_images()
                            # print(img_arr[0].shape)
                            img_arr = np.asarray(img_arr[0])
                            img_arr = normalize_and_crop(img_arr, cfg)
                            croppedImgH = img_arr.shape[0]
                            croppedImgW = img_arr.shape[1]
                            if img_arr.shape[2] == 3 and cfg.IMAGE_DEPTH == 1:
                                img_arr = dk.utils.rgb2gray(img_arr).reshape(
                                    croppedImgH, croppedImgW, 1)
                            steering, throttle = kl.run(img_arr)
                            action = [steering, throttle / 6.0]
                            action = np.asarray(action)
                            # rescaled_action = action * np.abs(self.action_space.low)
                            rescaled_action = action
                            print('Predicted action :', action)
                        except Exception as e:
                            print(e)
                            action = self.env.action_space.sample()
                            rescaled_action = action
                    else:
                        action = self.env.action_space.sample()
                        rescaled_action = action
                        print(action)
                    # No need to rescale when sampling random action
                elif not self.training_started:
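                    # The warm-up just ended: flag that training should begin and
                    # reset the environment; note that no new action is chosen on
                    # this step, so the previous iteration's action is reused below.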
                    self.start_training = True
                    obs = self.env.reset()
                else:

                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                ep_len += 1

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done or self.start_training:
                    self.start_training = False
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(
                        episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)

            plt.figure(1)
            plt.plot(episode_rewards)
            plt.title('Episode Rewards')
            plt.ylabel("Reward")
            plt.xlabel("Epoch")
            filename = "training" + str(random.random()) + ".png"
            plt.savefig(filename)
            plt.show()
        return self
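Example #18 replaces the usual uniform random warm-up with actions predicted by a pretrained supervised pilot (donkeycar's KerasLinear) until learning_starts steps have elapsed, falling back to random sampling if the pilot fails. A condensed sketch of that pattern; pretrained_policy is a generic stand-in rather than the donkeycar API, and policy_tf mirrors the attribute name used in the example:

import numpy as np

def warmup_action(step, obs, env, policy_tf, pretrained_policy=None, learning_starts=1000):
    """Choose an action during the warm-up phase (illustrative sketch).

    Before `learning_starts` steps, prefer a pretrained supervised pilot when one
    is available and fall back to uniform random sampling; afterwards, query the
    learned policy as usual.
    """
    if step < learning_starts:
        if pretrained_policy is not None:
            try:
                return np.asarray(pretrained_policy(obs))  # e.g. a steering/throttle pair
            except Exception:
                pass  # pilot failed, fall back to random exploration
        return env.action_space.sample()
    return policy_tf.step(obs[None], deterministic=False).flatten()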
Example #19
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="SAC",
              print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
            else:
                obs = self.env.reset()

            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                ep_len += 1

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(
                        episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        return self
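The SAC-style loops above map the tanh-squashed policy output to environment bounds with action * np.abs(self.action_space.low), which is only correct when the action space is symmetric around zero. The sketch below contrasts that shortcut with the general affine map from [-1, 1] to [low, high] (the Box bounds are illustrative):

import numpy as np
from gym import spaces

# An asymmetric Box: first dimension in [-1, 1], second in [0, 1]
box = spaces.Box(low=np.array([-1.0, 0.0]), high=np.array([1.0, 1.0]), dtype=np.float32)

def rescale_symmetric(action, space):
    # What the loops above do; only correct when high == -low
    return action * np.abs(space.low)

def unscale_affine(action, space):
    # General affine map from [-1, 1] to [low, high]
    return space.low + 0.5 * (action + 1.0) * (space.high - space.low)

a = np.array([0.5, -0.5])
print(rescale_symmetric(a, box))  # -> [0.5, -0.0]  (second dimension collapses to zero)
print(unscale_affine(a, box))     # -> [0.5, 0.25]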
Example #20
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            t_start = time.time()
            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # unpack
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
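Example #20 uses the object-based callback interface introduced in stable-baselines 2.10: the callback is wrapped by _init_callback, receives on_training_start / on_rollout_start / on_rollout_end / on_training_end events, and a rollout can abort training via runner.continue_training. A minimal custom callback under that interface, assuming stable-baselines >= 2.10 (the stopping criterion and threshold are illustrative):

from stable_baselines.common.callbacks import BaseCallback

class StopOnRewardCallback(BaseCallback):
    """Stop training once the rolling mean episode reward reaches a target (sketch)."""

    def __init__(self, target_reward, verbose=0):
        super(StopOnRewardCallback, self).__init__(verbose)
        self.target_reward = target_reward

    def _on_step(self):
        ep_info_buf = self.model.ep_info_buf
        if len(ep_info_buf) > 0:
            mean_reward = sum(ep['r'] for ep in ep_info_buf) / len(ep_info_buf)
            if mean_reward >= self.target_reward:
                return False  # False stops training, as with the older function callbacks
        return True

# model.learn(total_timesteps=1000000, callback=StopOnRewardCallback(target_reward=195))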
Example #21
File: idac.py | Project: zhougroup/IDAC
    def learn(self, total_timesteps, env_eval, callback=None, seed=None, path=None, dis_path=None, score_path=None,
              dis_eval_interval=100, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None):

        self.eval_env = env_eval
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            dis_eval_array = []  # (total_step % eval_intervel) x 2 x n_batch
            self.ep_length = 0

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                    or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                    noise = np.zeros(self.noise_dim)
                else:            
                    noise = self.policy_tf.gen_noise(obs[None]).flatten()
                    action = self.policy_tf.step(obs[None], noise[None], deterministic=False).flatten()

                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                self.ep_length += 1

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs, float(done), noise)

                episode_rewards[-1] += reward
                reset_flag = done or self.ep_length >= self.max_ep_length
                if reset_flag:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    obs = self.env.reset()
                    episode_rewards.append(0.0)
                    self.ep_length = 0

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                else:
                    obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)

                        mb_infos_vals.append(self._train_step(step, writer, current_lr, dis_eval_array,
                                                              dis_eval_interval, dis_path))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                if self.num_timesteps % 2000 == 0:
                    eval_ob = self.eval_env.reset()
                    eval_epi_rewards = 0
                    eval_epis = 0
                    eval_performance = []
                    eval_ep_step = 0
                    while True:
                        eval_noise = self.policy_tf.gen_noise(eval_ob[None]).flatten()
                        eval_action = self.policy_tf.step(eval_ob[None], eval_noise[None], deterministic=True).flatten()
                        eval_rescaled_action = eval_action * np.abs(self.action_space.low)
                        eval_new_obs, eval_reward, eval_done, eval_info = self.eval_env.step(eval_rescaled_action)
                        eval_epi_rewards += eval_reward
                        eval_ob = eval_new_obs
                        eval_ep_step += 1
                        if eval_done or eval_ep_step >= self.max_ep_length:
                            eval_ob = self.eval_env.reset()
                            eval_performance.append(eval_epi_rewards)
                            eval_epi_rewards = 0
                            eval_epis += 1
                            eval_ep_step = 0
                            if eval_epis > 5:
                                break
                    with open(score_path, 'a') as f2:
                        f2.write("%i %f\n" % (self.num_timesteps, np.mean(eval_performance)))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and reset_flag and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    with open(path,'a') as f1:
                        f1.write("%f " % step)
                        f1.write("%f " % mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                        with open(path,'a') as f1:
                            f1.write("%f " % safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                            f1.write("%f " % safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    with open(path,'a') as f1:
                        f1.write("%f " % n_updates)                    
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)

                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []

            return self
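Example #21 (IDAC) interleaves training with a deterministic evaluation pass every 2000 timesteps: it rolls the policy in a separate eval environment until six episodes (or the episode-length cap) have completed and appends the mean return to score_path. The same loop as a standalone helper; policy_step stands in for the policy_tf.step call plus the noise handling:

import numpy as np

def evaluate_policy(eval_env, policy_step, n_episodes=6, max_ep_length=1000):
    """Run deterministic rollouts and return the mean episode return (sketch)."""
    returns, ep_return, ep_step = [], 0.0, 0
    obs = eval_env.reset()
    while len(returns) < n_episodes:
        action = policy_step(obs)                  # deterministic action for evaluation
        obs, reward, done, _ = eval_env.step(action)
        ep_return += reward
        ep_step += 1
        if done or ep_step >= max_ep_length:
            returns.append(ep_return)
            ep_return, ep_step = 0.0, 0
            obs = eval_env.reset()
    return float(np.mean(returns))

# with open(score_path, 'a') as f:
#     f.write("%i %f\n" % (num_timesteps, evaluate_policy(eval_env, policy_step)))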
Example #22
    def learn(self, total_timesteps, callback=None,
              log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None,
              use_action_repeat=False, poisson=False):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        self.use_action_repeat = use_action_repeat
        # self.action_repetition = 0.8
        self.running_action_repetition = self.action_repetition
        self.poisson = poisson
        self.poisson_action = 4
        self.poisson_mean = 4
        prev_action = None
        # self.prob_past = 0.6
        # self.env.act_rep -= (21 - 4) / float(total_timesteps)
        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            # if(poisson):
            #     np.concatenate((obs,))
                
            # print(obs)
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
            self.num_timesteps = 0

            for step in range(total_timesteps):
                if poisson:
                    if self.poisson_mean < 1:
                        self.poisson_mean = 1

                    self.poisson_action = int(np.random.poisson(self.poisson_mean))

                    self.poisson_mean -= 5 / float(total_timesteps)
                    if self.poisson_action < 1:
                        self.poisson_action = 1
                if use_action_repeat:
                    # self.action_repetition-=((0.9)/float(total_timesteps))
                    amount = 4 / float(total_timesteps)
                    self.running_action_repetition -= amount
                    # print("Action repetition is :{}".format(self.action_repetition))
                    if 1 < self.running_action_repetition <= 2:
                        # if(self.action_repetition==4):
                            # print("Flushing replay buffer 4, {}".format(self.action_repetition))
                            # self.replay_buffer = ReplayBuffer(self.buffer_size)
                        self.action_repetition = 2
                    if self.running_action_repetition <= 1:
                        # if(self.action_repetition==2):
                            # print("Flushing replay buffer 2, {}".format(self.action_repetition))
                            # self.replay_buffer = ReplayBuffer(self.buffer_size)
                        self.action_repetition = 1
                        
                    # self.action_repetition = (self.action_repetition*amount +self.action_repetition-amount)/(1-amount+amount*self.action_repetition)
                    # if(self.action_repetition<0):
                    #     self.action_repetition=0
                    # self.env.dec_act_rep((21-4)/float(total_timesteps))
                    # self.running_action_repetition -= ((6-1)/float(total_timesteps))
                
                    # self.action_repetition = int(self.running_action_repetition)
                    # if(self.action_repetition<1):
                    #     self.action_repetition=1
                    
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                    or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    if poisson:
                        action = self.policy_tf.step(np.concatenate((obs,np.array([self.poisson_action])))[None], deterministic=False).flatten()
                    else:    
                        action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                # if use_action_repeat and prev_action is not None:
                #     if(np.random.uniform(0,1)<self.action_repetition):
                #         rescaled_action=prev_action
                
                assert action.shape == self.env.action_space.shape
                
                # Add action repetition
                # print("Action repetition is {}".format(self.action_repetition))
                if self.use_action_repeat: 
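                    # The last action dimension encodes how many extra repetitions
                    # to perform (offset by 4); only the remaining dimensions are
                    # passed to the environment below.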
                    repeated_reward = 0
                    # print("Repeating actions for: {}".format(int(rescaled_action[-1])+4))

                    for repeat_step in range(int(rescaled_action[-1])+4):
                        prev_action = rescaled_action
                        new_obs, reward, done, info = self.env.step(rescaled_action[:len(rescaled_action)-1])
                        repeated_reward+=reward
                        buffer_action = action.copy()
                        buffer_action[-1] = (rescaled_action[-1]+4-int(rescaled_action[-1]+4))+repeat_step+1 - 4
                        # print("Sub actions for: {}".format(buffer_action[-1]))

                        # Add extra supervision
                        # self.replay_buffer.add(obs, action, repeated_reward, new_obs, float(done))

                        if done:
                            break
                    reward = repeated_reward
                elif poisson:
                    repeated_reward = 0
                    # print("Poisson repetition is {}".format(self.poisson_action))
                    for _ in range(self.poisson_action):
                        # print("Repeating actions for: {}".format(self.action_repetition))
                        prev_action = rescaled_action
                        new_obs, reward, done, info = self.env.step(rescaled_action)
                        repeated_reward+=reward
                        if done:
                            break
                    reward = repeated_reward
                    
                else:
                    new_obs, reward, done, info = self.env.step(rescaled_action)
                
                # Store transition in the replay buffer.
                if poisson:
                    self.replay_buffer.add(np.concatenate((obs,np.array([self.poisson_action]))), action, reward, np.concatenate((new_obs,np.array([self.poisson_action]))), float(done))
                else:   
                    self.replay_buffer.add(obs, action, reward, new_obs, float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    prev_action=None
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
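Example #22 experiments with temporally extended actions: an environment action is repeated either a Poisson-sampled number of times, with the Poisson mean annealed toward 1 over training, or a number of times decoded from an extra action dimension. A condensed sketch of the Poisson variant; the annealing step matches the example, the function and variable names are illustrative:

import numpy as np

def poisson_repeat_step(env, action, poisson_mean):
    """Repeat `action` a Poisson-sampled number of times and sum the rewards (sketch)."""
    n_repeat = max(1, int(np.random.poisson(max(poisson_mean, 1))))
    total_reward, done, info = 0.0, False, {}
    for _ in range(n_repeat):
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return obs, total_reward, done, info, n_repeat

# Annealing as in the example: the mean shrinks by 5 / total_timesteps each step,
# so the repetition count gradually collapses to a single step late in training.
# poisson_mean -= 5.0 / float(total_timesteps)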
Example #23
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand(
                ) < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update the target network at the chosen interval
                        if (step + grad_step) % self.target_update_interval == 0:
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
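                    # Episode finished: reset the exploration noise and the environment, start a new reward entry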
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if (self.verbose >= 1 and done and log_interval is not None
                        and len(episode_rewards) % log_interval == 0):
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv("episode reward", episode_rewards[-2])
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean',
                                     safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean',
                                     safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)
            if self.priority_buffer:
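                # Give the prioritized buffer a handle to the model (presumably to compute sample priorities)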
                self.replay_buffer.set_model(self)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)
            self.env_id = self.env.env.get_attr('spec')[0].id

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
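            # Profiling accumulators for env stepping, buffer insertion and training (not logged below)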
            store_time = 0.0
            step_time = 0.0
            train_time = 0.0
            episode_rewards = [[0.0] for _ in range(self.env.env.num_envs)]
            episode_successes = [[] for _ in range(self.env.env.num_envs)]
            if self.action_noise is not None:
                self.action_noise.reset()
            assert isinstance(self.env.env, VecEnv)
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
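            # Rolling buffers of recent evaluation success rates used by the curriculum below;
            # start_decay marks the step at which random_ratio annealing kicks in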
            pp_sr_buf = deque(maxlen=5)
            stack_sr_buf = deque(maxlen=5)
            start_decay = total_timesteps
            if self.sequential and 'FetchStack' in self.env_id:
                # Start the sequential curriculum with tasks of at most 2 objects
                current_max_nobject = 2
                self.env.env.env_method(
                    'set_task_array',
                    [[(2, 0), (2, 1), (1, 0)]] * self.env.env.num_envs)
                print('Set task_array to',
                      self.env.env.get_attr('task_array')[0])
                self.env.env.env_method('set_random_ratio',
                                        [0.7] * self.env.env.num_envs)
            obs = self.env.reset()
            print(obs.shape)  # Debug: check the initial observation shape
            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Curriculum: every 3000 steps, evaluate the model and adjust task difficulty
                if self.curriculum and step % 3000 == 0:
                    if 'FetchStack' in self.env_id:
                        # Stacking environment: evaluate recent success rates for curriculum decisions
                        n_object = (current_max_nobject if self.sequential
                                    else self.env.env.get_attr('n_object')[0])
                        init_on_table = (self.env_id == 'FetchStack-v2')
                        pp_sr = eval_model(self.eval_env, self, n_object, 1.0,
                                           init_on_table=init_on_table)
                        pp_sr_buf.append(pp_sr)
                        stack_sr = eval_model(self.eval_env, self, n_object, 0.0,
                                              init_on_table=init_on_table)
                        stack_sr_buf.append(stack_sr)
                        print('Pick-and-place success rate', np.mean(pp_sr_buf))
                        if self.sequential:
                            random_ratio = self.env.env.get_attr('random_ratio')[0]
                            if random_ratio > 0.5 and np.mean(pp_sr_buf) > 0.8:
                                _ratio = 0.3
                            elif random_ratio < 0.5 \
                                    and current_max_nobject < self.env.env.get_attr('n_object')[0] \
                                    and np.mean(stack_sr_buf) > 1 / current_max_nobject:
                                # Success threshold reached: add tasks with one more object
                                _ratio = 0.7
                                current_max_nobject += 1
                                previous_task_array = self.env.env.get_attr('task_array')[0]
                                self.env.env.env_method(
                                    'set_task_array',
                                    [previous_task_array +
                                     [(current_max_nobject, j)
                                      for j in range(current_max_nobject)]] * self.env.env.num_envs)
                                print('Set task_array to',
                                      self.env.env.get_attr('task_array')[0])
                            else:
                                _ratio = random_ratio
                        else:
                            if start_decay == total_timesteps and np.mean(pp_sr_buf) > 0.8:
                                start_decay = step
                            # Anneal random_ratio from 0.7 down to 0.3 once start_decay is triggered
                            _ratio = np.clip(0.7 - (step - start_decay) / 2e6, 0.3, 0.7)
                    elif 'FetchPushWallObstacle' in self.env_id:
                        _ratio = max(1.0 - step / total_timesteps, 0.0)
                    else:
                        raise NotImplementedError
                    self.env.env.env_method('set_random_ratio',
                                            [_ratio] * self.env.env.num_envs)
                    print('Set random_ratio to',
                          self.env.env.get_attr('random_ratio')[0])

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    rescaled_action = np.stack(
                        [self.env.action_space.sample()
                         for _ in range(self.env.env.num_envs)],
                        axis=0)
                    action = rescaled_action
                else:
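                    # obs is already batched (one row per parallel env), so no extra batch dimension is needed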
                    action = self.policy_tf.step(obs, deterministic=False)
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == (self.env.env.num_envs,) + self.env.action_space.shape

                step_time0 = time.time()
                new_obs, reward, done, info = self.env.step(rescaled_action)
                step_time += time.time() - step_time0

                # The VecEnv auto-resets on done, so recover the true terminal observation
                # from the info dict before storing the transition
                next_obs = new_obs.copy()
                for idx, _done in enumerate(done):
                    if _done:
                        next_obs[idx] = self.env.convert_dict_to_obs(
                            info[idx]['terminal_observation'])

                # Store transition in the replay buffer.
                store_time0 = time.time()
                self.replay_buffer.add(obs, action, reward, next_obs, done)
                store_time += time.time() - store_time0
                obs = new_obs
                for idx, _done in enumerate(done):
                    episode_rewards[idx][-1] += reward[idx]
                    if _done:
                        episode_rewards[idx].append(0.0)
                        maybe_is_success = info[idx].get('is_success')
                        if maybe_is_success is not None:
                            episode_successes[idx].append(
                                float(maybe_is_success))

                # Retrieve reward and episode length if using Monitor wrapper
                for _info in info:
                    maybe_ep_info = _info.get('episode')
                    if maybe_ep_info is not None:
                        ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.reshape(reward, (self.env.env.num_envs, -1))
                    ep_done = np.reshape(done, (self.env.env.num_envs, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                train_time0 = time.time()
                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update the target network at the chosen interval
                        if (step + grad_step) % self.target_update_interval == 0:
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                train_time += time.time() - train_time0
                if len(episode_rewards[0][-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    # Mean over the last 100 completed episodes of every parallel env
                    mean_reward = round(
                        float(np.mean(np.concatenate(
                            [episode_rewards[i][-101:-1]
                             for i in range(self.env.env.num_envs)]))), 1)

                num_episodes = sum(len(ep) for ep in episode_rewards)
                self.num_timesteps += self.env.env.num_envs
                # Display training infos
                if (self.verbose >= 1 and done[0] and log_interval is not None
                        and len(episode_rewards[0])
                        % (log_interval // self.env.env.num_envs) == 0):
                    fps = int(self.num_timesteps / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean',
                                     safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean',
                                     safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes[0]) > 0:
                        logger.logkv(
                            "success rate",
                            np.mean(np.concatenate(
                                [episode_successes[i][-100:]
                                 for i in range(self.env.env.num_envs)])))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    if hasattr(self.eval_env.unwrapped, 'random_ratio'):
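                        # Note: the attribute is checked on eval_env, but the logged value comes from the training env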
                        logger.logkv("random_ratio",
                                     self.env.env.get_attr('random_ratio')[0])
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self