Example #1: PPOAgent (PPO with a Random Network Distillation intrinsic reward)
# Imports assumed by this example; CNNActorCritic, RNDNetwork, Buffer,
# Normalizer and global_grad_norm_ come from the surrounding project.
import os

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


class PPOAgent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.r_dim = args.r_dim

        self.lr = args.lr
        self.gamma_e = args.gamma_e
        self.gamma_i = args.gamma_i
        self.lamda = args.lamda
        self.entropy_coef = args.entropy_coef
        self.ex_coef = args.ex_coef
        self.in_coef = args.in_coef
        self.clip_eps = args.clip_eps
        self.update_epoch = args.update_epoch
        self.batch_size = args.batch_size
        self.initialize_episode = args.initialize_episode
        self.update_proportion = args.update_proportion
        self.rollout_len = args.rollout_len
        self.obs_clip = args.obs_clip

        self.device = torch.device(args.device)

        self.actor_critic = CNNActorCritic(in_channel=self.o_dim[0],
                                           a_dim=self.a_dim).to(self.device)
        self.RND = RNDNetwork(in_channel=1).to(self.device)

        self.optimizer = optim.Adam(list(self.actor_critic.parameters()) +
                                    list(self.RND.predictor.parameters()),
                                    lr=self.lr)

        self.buffer = Buffer(capacity=self.rollout_len, o_dim=self.o_dim)

        self.normalizer_obs = Normalizer(shape=self.o_dim, clip=self.obs_clip)
        self.normalizer_ri = Normalizer(shape=1, clip=np.inf)

    def choose_action(self, obs):
        obs = torch.from_numpy(obs).float().to(self.device) / 255.
        with torch.no_grad():
            action_logits = self.actor_critic.act(obs)

        dist = Categorical(action_logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)

        action = action.cpu().numpy()
        log_prob = log_prob.cpu().numpy()
        return action, log_prob

    def compute_intrinsic_reward(self, obs_):
        obs_ = self.normalizer_obs.normalize(obs_)
        obs_ = torch.from_numpy(obs_[:, 3:, :, :]).float().to(self.device)
        with torch.no_grad():
            pred_feature, tar_feature = self.RND(obs_)
        reward_in = F.mse_loss(pred_feature, tar_feature,
                               reduction='none').mean(dim=-1)
        reward_in = reward_in.cpu().detach().numpy()
        return reward_in

    def GAE_calculate(self, rewards, masks, values, gamma, lamda):
        """Compute discounted returns, TD errors and GAE advantages.

        Recursions (mask m_t = 0 at episode ends):
            R_t     = r_t + gamma * m_t * R_{t+1}
            delta_t = r_t + gamma * m_t * V_{t+1} - V_t
            A_t     = delta_t + gamma * lamda * m_t * A_{t+1}
        """
        returns = np.zeros(shape=len(rewards), dtype=np.float32)
        deltas = np.zeros(shape=len(rewards), dtype=np.float32)
        advantages = np.zeros(shape=len(rewards), dtype=np.float32)

        pre_return = 0.
        pre_advantage = 0.
        pre_value = 0.
        for i in reversed(range(len(rewards))):
            returns[i] = rewards[i] + masks[i] * gamma * pre_return
            deltas[i] = rewards[i] + masks[i] * gamma * pre_value - values[i]
            # The mask also cuts the advantage recursion at episode boundaries.
            advantages[i] = deltas[i] + masks[i] * gamma * lamda * pre_advantage

            pre_return = returns[i]
            pre_value = values[i]
            pre_advantage = advantages[i]

        return returns, deltas, advantages

    def update(self, o, a, r_i, r_e, mask, o_, log_prob):
        self.normalizer_obs.update(o_.reshape(-1, 4, 84, 84).copy())
        self.normalizer_ri.update(r_i.reshape(-1).copy())

        r_i = self.normalizer_ri.normalize(r_i)
        o_ = self.normalizer_obs.normalize(o_)
        o = torch.from_numpy(o).to(self.device).float() / 255.

        returns_ex = np.zeros_like(r_e)
        returns_in = np.zeros_like(r_e)
        advantage_ex = np.zeros_like(r_e)
        advantage_in = np.zeros_like(r_e)
        for i in range(r_e.shape[0]):
            with torch.no_grad():
                _, value_ex, value_in = self.actor_critic(o[i])
            value_ex = value_ex.cpu().numpy()
            value_in = value_in.cpu().numpy()
            returns_ex[i], _, advantage_ex[i] = self.GAE_calculate(
                r_e[i], mask[i], value_ex, self.gamma_e, self.lamda)  # episodic
            returns_in[i], _, advantage_in[i] = self.GAE_calculate(
                r_i[i], np.ones_like(mask[i]), value_in, self.gamma_i,
                self.lamda)  # non-episodic

        o = o.reshape((-1, 4, 84, 84))
        a = np.reshape(a, -1)
        o_ = np.reshape(o_[:, :, 3, :, :], (-1, 1, 84, 84))
        log_prob = np.reshape(log_prob, -1)
        returns_ex = np.reshape(returns_ex, -1)
        returns_in = np.reshape(returns_in, -1)
        advantage_ex = np.reshape(advantage_ex, -1)
        advantage_in = np.reshape(advantage_in, -1)

        a = torch.from_numpy(a).float().to(self.device)
        o_ = torch.from_numpy(o_).float().to(self.device)
        log_prob = torch.from_numpy(log_prob).float().to(self.device)
        returns_ex = torch.from_numpy(returns_ex).float().to(
            self.device).unsqueeze(dim=1)
        returns_in = torch.from_numpy(returns_in).float().to(
            self.device).unsqueeze(dim=1)
        advantage_ex = torch.from_numpy(advantage_ex).float().to(self.device)
        advantage_in = torch.from_numpy(advantage_in).float().to(self.device)

        sample_range = list(range(len(o)))

        for i_update in range(self.update_epoch):
            np.random.shuffle(sample_range)
            for j in range(int(len(o) / self.batch_size)):
                idx = sample_range[self.batch_size * j:self.batch_size *
                                   (j + 1)]
                # update the RND predictor on a random subset of the batch
                pred_RND, tar_RND = self.RND(o_[idx])
                loss_RND = F.mse_loss(pred_RND,
                                      tar_RND.detach(),
                                      reduction='none').mean(-1)
                # Uniform mask so that roughly `update_proportion` of the
                # samples contribute to the predictor loss.
                update_mask = (torch.rand(len(loss_RND), device=self.device) <
                               self.update_proportion).float()
                loss_RND = (loss_RND * update_mask).sum() / torch.max(
                    update_mask.sum(),
                    torch.tensor(1., device=self.device))

                #update actor-critic
                action_logits, value_ex, value_in = self.actor_critic(o[idx])
                advantage = self.ex_coef * advantage_ex[
                    idx] + self.in_coef * advantage_in[idx]
                dist = Categorical(action_logits)
                new_log_prob = dist.log_prob(a[idx])

                ratio = torch.exp(new_log_prob - log_prob[idx])
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.clip_eps,
                                    1 + self.clip_eps) * advantage
                # PPO maximizes the clipped surrogate, so minimize its negative.
                loss_actor = (-torch.min(surr1, surr2).mean() -
                              self.entropy_coef * dist.entropy().mean())
                loss_critic = F.mse_loss(value_ex,
                                         returns_ex[idx]) + F.mse_loss(
                                             value_in, returns_in[idx])

                loss_ac = loss_actor + 0.5 * loss_critic

                loss = loss_RND + loss_ac
                self.optimizer.zero_grad()
                loss.backward()
                global_grad_norm_(
                    list(self.actor_critic.parameters()) +
                    list(self.RND.predictor.parameters()))
                self.optimizer.step()

        return (loss_RND.cpu().detach().numpy(),
                loss_actor.cpu().detach().numpy(),
                loss_critic.cpu().detach().numpy())

    def save_model(self, remark):
        if not os.path.exists('pretrained_models_PPO/'):
            os.mkdir('pretrained_models_PPO/')
        path = 'pretrained_models_PPO/{}.pt'.format(remark)
        print('Saving model to {}'.format(path))
        torch.save(self.actor_critic.state_dict(), path)

    def load_model(self, load_model_remark):
        print('Loading models with remark {}'.format(load_model_remark))
        model = torch.load(
            'pretrained_models_PPO/{}.pt'.format(load_model_remark),
            map_location=lambda storage, loc: storage)
        self.actor_critic.load_state_dict(model)
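
A minimal usage sketch for Example #1 (hypothetical: the args namespace, env_params dict and the dummy observation below are illustrative and only cover the fields that __init__ and choose_action actually read):

import argparse

args = argparse.Namespace(
    r_dim=512, lr=1e-4, gamma_e=0.999, gamma_i=0.99, lamda=0.95,
    entropy_coef=0.001, ex_coef=2.0, in_coef=1.0, clip_eps=0.1,
    update_epoch=4, batch_size=128, initialize_episode=50,
    update_proportion=0.25, rollout_len=128, obs_clip=5.0, device='cpu')
env_params = {'o_dim': (4, 84, 84), 'a_dim': 6}  # 4 stacked 84x84 frames, 6 discrete actions

agent = PPOAgent(args, env_params)
obs = np.zeros((1, 4, 84, 84), dtype=np.uint8)   # placeholder observation batch
action, log_prob = agent.choose_action(obs)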
Example #2: DDPG_Agent (DDPG with a forward/inverse-model curiosity reward)
# Imports assumed by this example; Normalizer, Memory, Policy, QFunction,
# StateEncoder, ForwardModel, InverseModel and get_state come from the
# surrounding project.
import os

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class DDPG_Agent():
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start

        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim,
                                       eps=1e-2,
                                       clip_range=1.)

        self.memory = Memory(size=args.memory_size,
                             s_dim=self.s_dim,
                             a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim,
                             a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim,
                                    a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim,
                                  a_dim=self.a_dim).to(self.device)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim,
                                    f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim,
                                       a_dim=self.a_dim).to(self.device)

        self.optimizer_forward = optim.Adam(
            [{
                'params': self.EnvForward.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{
                'params': self.EnvInverse.parameters()
            }, {
                'params': self.encoder.parameters()
            }],
            lr=self.lr_model)

        self.hard_update()

        self.update_num = 0

    def select_action(self, state, train_mode=True):
        # Normalize the state before feeding it to the policy.
        s = self.normalize_input(state)
        s = torch.tensor(s, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action = self.policy(s).cpu().numpy()

        if train_mode:
            # Gaussian exploration noise
            action += np.random.randn(
                self.a_dim) * self.noise_eps * self.action_bound

        action = np.clip(action,
                         a_min=-self.action_bound,
                         a_max=self.action_bound)
        return action

    def get_intrinsic_reward(self, s, a, s_):
        """Curiosity bonus: forward-model prediction error in feature space."""
        s = torch.from_numpy(s).to(self.device).float()
        a = torch.from_numpy(a).to(self.device).float()
        s_ = torch.from_numpy(s_).to(self.device).float()
        with torch.no_grad():
            feature = self.encoder(s)
            next_feature_pred = self.EnvForward(feature, a)
            next_feature = self.encoder(s_)
        r_i = self.eta * torch.norm(next_feature_pred - next_feature)
        r_i = torch.clamp(r_i, min=-0.1, max=0.1)
        return r_i.cpu().numpy()

    def train(self, env, logger=None):
        total_step = 0
        loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
        for i_episode in range(self.max_episode):
            obs = env.reset()
            s = get_state(obs)

            cumulative_r = 0.
            for i_step in range(self.max_timestep):
                a = self.select_action(s)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)

                r_i = self.get_intrinsic_reward(s, a, s_)
                r = r_e + r_i

                self.memory.store(s, a, r, s_)
                s = s_

                if len(self.memory) > self.batch_size:
                    loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
                cumulative_r += r_e
                total_step += 1

            print(
                'i_episode: {} total step: {} cumulative reward: {:.4f} is_success: {} '
                .format(i_episode, total_step, cumulative_r,
                        info['is_success']))
            if logger is not None and i_episode % self.log_interval == 0:
                logger.add_scalar('Indicator/cumulative reward', cumulative_r,
                                  i_episode)
                logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
                logger.add_scalar('Loss/q_loss', loss_q, i_episode)
                logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
                logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)
            if i_episode % self.evaluate_interval == 0:
                success_rate = self.evaluate(env)
                if logger is not None:
                    logger.add_scalar('Indicator/success rate', success_rate,
                                      i_episode)

            if i_episode > self.save_model_start and i_episode % self.save_model_interval == 0:
                self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))

    def evaluate(self, env, render=False):
        success_count = 0
        for i_episode in range(self.evaluate_episode):
            obs = env.reset()
            s = get_state(obs)
            for i_step in range(self.max_timestep):
                if render:
                    env.render()
                a = self.select_action(s, train_mode=False)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                s = s_
            success_count += info['is_success']

        return success_count / self.evaluate_episode

    def learn(self):
        s, a, r, s_ = self.memory.sample_batch(batch_size=self.batch_size)
        self.normalizer_s.update(s)

        s, s_ = self.normalize_input(s, s_)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)

        # update Q (critic): one-step TD target from the target networks
        with torch.no_grad():
            a_next_tar = self.policy_target(s_)
            Q_next_tar = self.Q_target(s_, a_next_tar)
            q_target = r + self.gamma * Q_next_tar
        q_pred = self.Q(s, a)
        loss_q = F.mse_loss(q_pred, q_target)
        self.optimizer_q.zero_grad()
        loss_q.backward()
        self.optimizer_q.step()

        loss_p = -self.Q(s, self.policy(s)).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        self.optimizer_p.step()

        self.soft_update()

        #update env model and encoder
        feature = self.encoder(s)
        next_feature = self.encoder(s_)
        a_pred = self.EnvInverse(feature, next_feature)
        loss_inverse = F.mse_loss(a_pred, a)

        next_feature_pred = self.EnvForward(feature, a)
        with torch.no_grad():
            next_feature_tar = self.encoder(s_)
        loss_forward = F.mse_loss(next_feature_pred, next_feature_tar.detach())

        self.optimizer_forward.zero_grad()
        self.optimizer_inverse.zero_grad()
        loss_forward.backward(retain_graph=True)
        loss_inverse.backward()
        self.optimizer_forward.step()
        self.optimizer_inverse.step()

        self.update_num += 1
        return (loss_p.cpu().detach().numpy(), loss_q.cpu().detach().numpy(),
                loss_forward.cpu().detach().numpy(),
                loss_inverse.cpu().detach().numpy())

    def update_normalizer(self, states):
        states = np.array(states, dtype=np.float32)
        self.normalizer_s.update(states)

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(),
                                       self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data *
                                    (1 - self.tau))

    def normalize_input(self, s, s_=None):
        s = self.normalizer_s.normalize(s)
        if s_ is not None:
            s_ = self.normalizer_s.normalize(s_)
            return s, s_
        else:
            return s

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([
            self.normalizer_s.mean, self.normalizer_s.std,
            self.policy.state_dict()
        ], path)

    def load_model(self, remark):
        print('Loading models with remark {}'.format(remark))
        self.normalizer_s.mean, self.normalizer_s.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remark),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
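
A minimal usage sketch for Example #2 (hypothetical: the args namespace and env_params values are illustrative and only cover the fields that __init__ reads; a gym-style goal-conditioned environment and the project's get_state helper are assumed for training):

import argparse

args = argparse.Namespace(
    f_dim=64, max_episode=1000, evaluate_episode=10, evaluate_interval=50,
    log_interval=10, save_model_interval=100, save_model_start=500,
    lr=1e-3, lr_model=1e-3, gamma=0.98, batch_size=256, tau=0.05,
    eta=0.1, noise_eps=0.1, device='cpu', memory_size=int(1e6))
env_params = {'o_dim': 10, 'g_dim': 3, 'a_dim': 4,
              'action_max': 1.0, 'max_timestep': 50}  # placeholder dimensions

agent = DDPG_Agent(args, env_params)
# agent.train(env) then runs the full loop on a goal-conditioned gym environment.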
Example #3: ddpgAgent (DDPG with Hindsight Experience Replay)
# Imports assumed by this example; Normalizer, Actor, Critic and ReplayBuffer
# come from the surrounding project.
import copy
from collections import OrderedDict

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class ddpgAgent(object):
    def __init__(self, params):
        """Implementation of DDPG agent with Hindsight Experience Replay (HER) sampler.

        @param params: dict containing all necessary parameters:
        dims, buffer_size, tau (= 1-polyak), batch_size, lr_critic, lr_actor, norm_eps, norm_clip, clip_obs,
        clip_action, T (episode length), num_workers, clip_return, sample_her_transitions, gamma, replay_strategy
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.input_dims = params['dims']
        self.buffer_size = params['buffer_size']
        self.tau = params['tau']
        self.batch_size = params['batch_size']
        self.critic_lr = params['lr_critic']
        self.actor_lr = params['lr_actor']
        self.norm_eps = params['norm_eps']
        self.norm_clip = params['norm_clip']
        self.clip_obs = params['clip_obs']
        self.clip_action = params['clip_action']

        self.T = params['T']
        self.rollout_batch_size = params['num_workers']
        self.clip_return = params['clip_return']
        self.sample_transitions = params['sample_her_transitions']
        self.gamma = params['gamma']
        self.replay_strategy = params['replay_strategy']

        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, self.input_dims[key])
        stage_shapes['o_2'] = stage_shapes['o']
        stage_shapes['r'] = (None,)
        self.stage_shapes = stage_shapes

        # normalizer
        self.obs_normalizer = Normalizer(size=self.dimo, eps=self.norm_eps, clip_range=self.norm_clip)
        self.goal_normalizer = Normalizer(size=self.dimg, eps=self.norm_eps, clip_range=self.norm_clip)

        # networks
        self.actor_local = Actor(self.input_dims).to(self.device)
        self.critic_local = Critic(self.input_dims).to(self.device)
        self.actor_target = copy.deepcopy(self.actor_local)
        self.critic_target = copy.deepcopy(self.critic_local)

        # optimizers
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_lr)

        # Configuring the replay buffer
        buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, self.input_dims[key])
                         for key, val in self.input_dims.items()}
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T, self.dimg)
        buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size

        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)


    def act(self, o, g, noise_eps=0., random_eps=0., testing=False):
        """Choose action from observations with probability 'random_eps' at random,
        else use actor output and add noise 'noise_eps'

        @param o: observation
        @param g: desired goal
        @param noise_eps: noise added to action
        @param random_eps: random action probability
        @param testing: (bool) set to 'True' if testing a single environment
        """

        obs = self.obs_normalizer.normalize(o)
        goals = self.goal_normalizer.normalize(g)

        obs = torch.tensor(obs).to(self.device)
        goals = torch.tensor(goals).to(self.device)

        # for testing single environment
        if testing:
            with torch.no_grad():
                action = self.actor_local(torch.cat([obs, goals], dim=0)).cpu().data.numpy()
            return action

        actions = self.actor_local(torch.cat([obs, goals], dim=1))

        noise = (noise_eps * np.random.randn(actions.shape[0], self.dimu)).astype(np.float32)
        actions += torch.tensor(noise).to(self.device)

        eps_greedy_noise = np.random.binomial(1, random_eps, actions.shape[0]).reshape(-1, 1)

        random_action = torch.tensor(np.random.uniform(
            low=-1., high=1., size=(actions.shape[0], self.dimu)).astype(np.float32)).to(self.device)

        actions += torch.tensor(eps_greedy_noise.astype(np.float32)).to(self.device) * (
                    random_action - actions)  # eps-greedy

        actions = torch.clamp(actions, -self.clip_action, self.clip_action)

        return actions

    def store_episode(self, episode_batch):
        """Store episodes to replay buffer.

        @param episode_batch: array of batch_size x (T or T+1) x dim_key.
        Observation 'o' is of size T+1, others are of size T
        """
        self.buffer.store_episode(episode_batch)

        # add transitions to normalizer
        episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
        episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]
        shape = episode_batch['u'].shape
        num_normalizing_transitions = shape[0] * shape[1]  # num_rollouts * (T - 1), steps every cycle
        transitions = self.sample_transitions(episode_batch, num_normalizing_transitions)

        self.obs_normalizer.update(transitions['o'])
        self.goal_normalizer.update(transitions['g'])

        self.obs_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    def sample_batch(self):
        """Sample random transitions from replay buffer (which also contains HER samples).

        @return: transitions
        """
        transitions = self.buffer.sample(self.batch_size)
        return [transitions[key] for key in self.stage_shapes.keys()]

    def learn(self):
        """Learning step, i.e., one optimization update of the critic and actor networks."""
        batch = self.sample_batch()
        batch_dict = OrderedDict([(key, batch[i].astype(np.float32).copy())
                                  for i, key in enumerate(self.stage_shapes.keys())])
        batch_dict['r'] = np.reshape(batch_dict['r'], [-1, 1])

        # prepare state, action, reward, next state
        obs = torch.tensor(self.obs_normalizer.normalize(batch_dict['o'])).to(self.device)
        goal = torch.tensor(self.goal_normalizer.normalize(batch_dict['g'])).to(self.device)
        actions = torch.tensor(batch_dict['u']).to(self.device)
        rewards = torch.tensor(batch_dict['r'].astype(np.float32)).to(self.device)
        obs_2 = torch.tensor(self.obs_normalizer.normalize(batch_dict['o_2'])).to(self.device)

        # update critic --------------------------------------------------------------

        # compute predicted Q values
        next_actions = self.actor_target(torch.cat([obs_2, goal], dim=1))
        next_Q_targets = self.critic_target(torch.cat([obs_2, goal], dim=1), next_actions)

        # compute Q values for current states and clip them
        Q_targets = rewards + self.gamma * next_Q_targets          # Note: last experience of episode is not included
        Q_targets = torch.clamp(Q_targets, -self.clip_return, 0.)  # clipping

        # compute loss
        Q_expected = self.critic_local(torch.cat([obs, goal], dim=1), actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # update weights critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # update actor -------------------------------------------------------------

        # compute loss
        pred_actions = self.actor_local(torch.cat([obs, goal], dim=1))
        actor_loss = -self.critic_local(torch.cat([obs, goal], dim=1), pred_actions).mean()
        actor_loss += (pred_actions ** 2).mean()  # L2 penalty that keeps pre-clip actions small

        # update weights actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update_target_networks(self):
        """Soft update model parameters:
            θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        # update critic net
        for target_param, local_param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
        # update actor net
        for target_param, local_param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def save_checkpoint(self, path, name):
        """Save actor, critic networks and the stats for normalization to the path.

        @param path: path to store checkpoints
        @param name: (str) name of environment, for naming files
        """
        torch.save(self.actor_local.state_dict(), path + '/'+name+'_checkpoint_actor_her.pth')
        torch.save(self.critic_local.state_dict(), path + '/'+name+'_checkpoint_critic_her.pth')
        self.obs_normalizer.save_normalizer(path + '/'+name+'_obs_normalizer.pth')
        self.goal_normalizer.save_normalizer(path + '/'+name+'_goal_normalizer.pth')
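
A minimal usage sketch for Example #3 (hypothetical: the params dict is illustrative and only covers the keys that __init__ reads; her_sampler stands in for the project's HER transition-sampling function, and the rollout code that builds episode_batch is assumed):

params = {
    'dims': {'o': 10, 'g': 3, 'u': 4},         # observation, goal and action sizes
    'buffer_size': int(1e6), 'tau': 0.05, 'batch_size': 256,
    'lr_critic': 1e-3, 'lr_actor': 1e-3,
    'norm_eps': 0.01, 'norm_clip': 5.0, 'clip_obs': 200.0, 'clip_action': 1.0,
    'T': 50, 'num_workers': 2, 'clip_return': 50.0,
    'sample_her_transitions': her_sampler,     # assumed HER sampler
    'gamma': 0.98, 'replay_strategy': 'future',
}

agent = ddpgAgent(params)
# Typical cycle: collect episodes with agent.act(...), then
#   agent.store_episode(episode_batch)
#   agent.learn()
#   agent.soft_update_target_networks()
#   agent.save_checkpoint('./checkpoints', 'FetchReach')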