Example #1
    def __init__(self,
                 env: Env = None,
                 capacity=2e6,
                 batch_size=128,
                 action_lim=1,
                 learning_rate=0.001,
                 gamma=0.999,
                 epochs=2):
        if env is None:
            raise "agent should have an environment"
        super(DDPGAgent, self).__init__(env, capacity)
        self.state_dim = env.observation_space.shape[0]  # continuous state
        self.action_dim = env.action_space.shape[0]  # continuous action
        self.action_lim = action_lim  # action value bound
        self.batch_size = batch_size  # number of transitions per batch update
        self.learning_rate = learning_rate  # learning rate
        self.gamma = gamma  # discount factor
        self.epochs = epochs  # number of training passes over one batch of transitions
        self.tau = 0.001  # soft-update (Polyak) coefficient
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim,
                                  self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate)
        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate)

        hard_update(self.target_actor, self.actor)  # hard copy
        hard_update(self.target_critic, self.critic)  # hard copy
        return
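The examples in this listing all depend on an OrnsteinUhlenbeckActionNoise helper that is not shown. Below is a minimal sketch of such a class, matching the single-argument constructor and sample() method used in Example #1; the mu, theta, sigma and dt defaults are assumptions rather than values taken from any of the original projects.

import numpy as np


class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.action_dim = action_dim
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # restart the process from its long-run mean at the beginning of an episode
        self.x = np.copy(self.mu)

    def sample(self):
        # Euler-Maruyama step of the OU stochastic differential equation
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.action_dim)
        self.x = self.x + dx
        return self.x

    def __call__(self):
        # some examples call the noise object directly, so alias __call__ to sample()
        return self.sample()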
Example #2
    def __init__(self, env):
        super(DDPG, self).__init__()

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net = QNet(self.ns, self.na)
        self.q_net = q_net.to(self.device)

        q_target = QNet(self.ns, self.na)
        self.q_target = q_target.to(self.device)
        self.load_state_dict(self.q_target, self.q_net.state_dict())

        self.optimizer_q = torch.optim.Adam(self.q_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.noise = OrnsteinUhlenbeckActionNoise(
            torch.zeros(1, self.na).to(self.device),
            self.epsilon * torch.ones(1, self.na).to(self.device))
Example #3
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
                 gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')

        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma

        self.sess = tf.Session()

        self.actor = Actor(self.sess,
                           self.s,
                           self.s_,
                           action_dim,
                           action_bound,
                           tau,
                           lr_a,
                           f1_units=300)
        self.critic = Critic(self.sess,
                             lr_c,
                             self.s,
                             self.s_,
                             self.actor.a,
                             self.actor.a_,
                             self.target,
                             tau,
                             gamma,
                             state_dim,
                             action_dim,
                             f1_units=300)
        self.actor.add_grad_to_graph(self.critic.a_g)

        self.sess.run(tf.global_variables_initializer())
Example #4
def train(
    env,
    actor_learning_rate=0.0001,
    critic_learning_rate=0.001,
    gamma=0.99,
    tau=0.001,
    max_episodes=100,
    buffer_size=1000000,
    batch_size=64,
    plot_flag=True,
    verbose=True,
    save_dir=None,
):
    if save_dir and os.path.exists(os.path.join(save_dir, "epi_rwds.npy")):
        return np.load(os.path.join(save_dir, "epi_rwds.npy"))

    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # setup learner
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_gain = env.action_space.high
    learner = DDPG(
        obs_dim,
        action_dim,
        action_gain,
        actor_learning_rate=actor_learning_rate,
        critic_learning_rate=critic_learning_rate,
        gamma=gamma,
        tau=tau,
    )

    # setup other reqs -  buffer, noise
    buffer = ReplayBuffer(buffer_size)
    noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
    epi_rwds = []

    # train
    # play the training episodes

    for e in range(0, max_episodes + 200):
        print('game no.', e, end=' ')
        run_episode(learner, env, noise, buffer, batch_size=batch_size)

        if e % 10 == 0:
            # testing without noise at the end of each epoch
            # one epoch = 10 episodes
            e_rwds = 0
            for _ in range(10):
                e_rwds += run_episode(learner,
                                      env,
                                      None,
                                      buffer,
                                      train_flag=False)
            print(
                "End of epoch # {} | {} training episodes completed | Total reward = {}"
                .format(e // 10, e, e_rwds / 10))
            epi_rwds.append(e_rwds / 10)

    # save trained model and episode rewards
    if save_dir:
        learner.save_model(save_dir)
        np.save(os.path.join(save_dir, "epi_rwds.npy"), epi_rwds)

    # plot training curve
    if plot_flag:
        plt.plot(epi_rwds)
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        if save_dir:
            plt.savefig(os.path.join(save_dir, "training_curve.png"))
        plt.show()

    return epi_rwds
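run_episode is not defined in Example #4. The sketch below shows the loop it presumably implements; the learner.get_action and learner.update methods, as well as the buffer.add/size/sample interface, are hypothetical placeholders for whatever the DDPG learner and ReplayBuffer actually expose, and the noise object is assumed to be callable as in Example #5.

def run_episode(learner, env, noise, buffer, batch_size=64, train_flag=True):
    """Rough sketch: play one episode, optionally with exploration noise and replay-based training."""
    state = env.reset()
    episode_reward, done = 0.0, False
    while not done:
        action = learner.get_action(state)  # hypothetical learner method
        if noise is not None:
            action = action + noise()  # add OU exploration noise
        next_state, reward, done, _ = env.step(action)
        buffer.add(state, action, reward, next_state, done)  # assumed buffer API
        episode_reward += reward
        if train_flag and buffer.size() > batch_size:
            learner.update(buffer.sample(batch_size))  # hypothetical training call
        state = next_state
    return episode_reward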
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    parser.add_argument("--input-shape", type=list, default=[None, 1])
    parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument('--tau',
                        help='soft target update parameter',
                        default=0.001)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--gamma", type=float, default=.99)

    parser.add_argument("--K",
                        type=int,
                        default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L",
        type=int,
        default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    args = parser.parse_args()

    # Initialize environment
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.input_shape = [None, args.state_dim]
    args.action_dim = env.action_space.shape[0]
    #assert args.action_dim == 1
    args.action_bound = env.action_space.high
    print(args)

    # Networks
    actor_source = actor(state_shape=[None, args.state_dim],\
        action_shape=[None, args.action_dim],\
        output_bound=args.action_bound[0],\
        scope='actor_source')
    critic_source = critic(state_shape=[None, args.state_dim],\
        action_shape=[None, args.action_dim],\
        scope='critic_source')
    actor_target = actor(state_shape=[None, args.state_dim],\
        action_shape=[None, args.action_dim],\
        output_bound=args.action_bound[0],\
        scope='actor_target')
    critic_target = critic(state_shape=[None, args.state_dim],\
        action_shape=[None, args.action_dim],\
        scope='critic_target')

    # Initialize the GANs
    cgan_state = CGAN(input_shape=args.input_shape,\
        action_size=args.action_dim,\
        latent_size=args.latent_size,\
        gen_input_shape=args.input_shape,\
        continuous_action=True)
    cgan_reward = CGAN(input_shape=args.input_shape,\
        action_size=args.action_dim,\
        latent_size=args.latent_size,\
        gen_input_shape=[None, 1],\
        continuous_action=True)

    # Update and copy operators
    update_target_actor = update_target_graph2('actor_source', 'actor_target',
                                               args.tau)
    update_target_critic = update_target_graph2('critic_source',
                                                'critic_target', args.tau)

    copy_target_actor = update_target_graph2('actor_source', 'actor_target',
                                             1.)
    copy_target_critic = update_target_graph2('critic_source', 'critic_target',
                                              1.)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(args.action_dim))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(copy_target_critic)
        sess.run(copy_target_actor)

        for epoch in range(args.epochs):
            state = env.reset()
            total_rewards = 0.0
            while True:
                #env.render()
                # Choose an action
                action = sess.run(
                    actor_source.action,
                    feed_dict={actor_source.states: state[np.newaxis, ...]
                               })[0] + actor_noise()
                # Execute action
                state1, reward, done, _ = env.step(action)
                total_rewards += float(reward)
                # Store tuple in replay memory
                memory.add([state[np.newaxis, ...],\
                    action[np.newaxis, ...],\
                    reward,\
                    state1[np.newaxis, ...],\
                    done])

                # Training step: update actor critic using real experience
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                states = np.concatenate(batch[:, 0], axis=0)
                actions = np.concatenate(batch[:, 1], axis=0)
                rewards = batch[:, 2]
                states1 = np.concatenate(batch[:, 3], axis=0)
                dones = batch[:, 4]

                # Update the critic
                actions1 = sess.run(actor_target.action,\
                    feed_dict={actor_target.states:states1})
                targetQ = np.squeeze(sess.run(critic_target.Q,\
                    feed_dict={critic_target.states:states1,\
                        critic_target.actions:actions1}), axis=-1)
                targetQ = rewards + (
                    1. - dones.astype(np.float32)) * args.gamma * targetQ
                targetQ = targetQ[..., np.newaxis]
                _, critic_loss = sess.run([critic_source.critic_solver,\
                    critic_source.loss],\
                    feed_dict={critic_source.states:states,\
                        critic_source.actions:actions,\
                        critic_source.targetQ:targetQ})

                # Update the actor
                critic_grads = sess.run(critic_source.grads,\
                    feed_dict={critic_source.states:states,\
                        critic_source.actions:actions})[0]# Grab gradients from critic
                _ = sess.run(actor_source.opt,\
                    feed_dict={actor_source.states:states,\
                        actor_source.dQ_by_da:critic_grads})

                # Update target networks
                sess.run(update_target_critic)
                sess.run(update_target_actor)

                # Training step: update the environment model using real experience (i.e., update the conditional GANs)
                for k in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = np.concatenate(batch[:, 1], axis=0)
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)

                    _, D_loss_state = sess.run([cgan_state.D_solver, cgan_state.D_loss],\
                        feed_dict={cgan_state.states:states,\
                            cgan_state.actions:actions,\
                            cgan_state.Z:sample_z(len(batch),\
                            args.latent_size),\
                            cgan_state.X:states1})

                    _, G_loss_state = sess.run([cgan_state.G_solver,\
                        cgan_state.G_loss],\
                        feed_dict={cgan_state.states:states,\
                            cgan_state.actions:actions,\
                            cgan_state.Z:sample_z(len(batch),\
                            args.latent_size)})

                    _, D_loss_reward = sess.run([cgan_reward.D_solver,\
                        cgan_reward.D_loss],\
                        feed_dict={cgan_reward.states:states,\
                            cgan_reward.actions:actions,\
                            cgan_reward.Z:sample_z(len(batch),\
                            args.latent_size),\
                            cgan_reward.X:rewards[..., np.newaxis]})

                    _, G_loss_reward = sess.run([cgan_reward.G_solver,\
                        cgan_reward.G_loss],\
                        feed_dict={cgan_reward.states:states,\
                            cgan_reward.actions:actions,\
                            cgan_reward.Z:sample_z(len(batch),\
                            args.latent_size)})
                    #print D_loss_state, G_loss_state, D_loss_reward, G_loss_state

                # Training step: update actor critic using imagination rollouts
                for l in range(args.L):
                    batch = np.array(memory.sample(args.batch_size))
                    states_ = np.concatenate(batch[:, 3], axis=0)
                    actions = np.random.uniform(env.action_space.low[0],\
                        env.action_space.high[0],\
                        size=(len(batch),\
                        env.action_space.shape[0]))
                    dones = np.array([False] * len(batch))

                    G_sample_state = sess.run(cgan_state.G_sample,\
                        feed_dict={cgan_state.states:states_,\
                            cgan_state.actions:actions,\
                            cgan_state.Z:sample_z(len(batch),\
                            args.latent_size)})
                    G_sample_reward = sess.run(cgan_reward.G_sample,\
                        feed_dict={cgan_reward.states:states_,\
                            cgan_reward.actions:actions,\
                            cgan_reward.Z:sample_z(len(batch),\
                            args.latent_size)})
                    G_sample_reward = np.squeeze(G_sample_reward, axis=-1)

                    # Update the critic
                    actions1 = sess.run(actor_target.action,\
                        feed_dict={actor_target.states:G_sample_state})
                    targetQ = np.squeeze(sess.run(critic_target.Q,\
                        feed_dict={critic_target.states:G_sample_state,\
                            critic_target.actions:actions1}), axis=-1)
                    targetQ = G_sample_reward + (
                        1. - dones.astype(np.float32)) * args.gamma * targetQ
                    targetQ = targetQ[..., np.newaxis]
                    _, critic_loss = sess.run([critic_source.critic_solver,\
                        critic_source.loss],\
                        feed_dict={critic_source.states:states_,\
                            critic_source.actions:actions,\
                            critic_source.targetQ:targetQ})

                    # Update the actor
                    critic_grads = sess.run(critic_source.grads,\
                        feed_dict={critic_source.states:states_,\
                            critic_source.actions:actions})[0]# Grab gradients from critic
                    _ = sess.run(actor_source.opt,\
                        feed_dict={actor_source.states:states_,\
                            actor_source.dQ_by_da:critic_grads})

                    # Update target networks
                    sess.run(update_target_critic)
                    sess.run(update_target_actor)

                state = np.copy(state1)
                if done:
                    print('epoch', epoch, 'total rewards', total_rewards)
                    break
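Example #5 relies on two helpers that are not shown, update_target_graph2 and sample_z. The following is a rough sketch of what they conventionally look like in TF1-style DDPG/GAN code; scope-based variable lookup and a uniform latent prior are assumptions, not details taken from the original project.

def update_target_graph2(from_scope, to_scope, tau):
    # Build ops that Polyak-average trainable variables from `from_scope` into `to_scope`
    # (tau=1.0 yields the hard copy used for the initial synchronisation above).
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(tau * from_var + (1. - tau) * to_var)
            for from_var, to_var in zip(from_vars, to_vars)]


def sample_z(batch_size, latent_size):
    # Latent noise fed to the conditional GANs; a uniform prior in [-1, 1] is assumed here.
    return np.random.uniform(-1., 1., size=(batch_size, latent_size))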
Example #6
File: main.py  Project: yosider/RLSnipets
CRITIC_LEARNING_RATE = 0.001
BUFFER_SIZE = 1000000
MINIBATCH_SIZE = 64
MAX_EPISODES = 500
MAX_EP_STEPS = 1000
GAMMA = 0.99

with tf.Session() as sess:

    critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM, TAU,
                           CRITIC_LEARNING_RATE)
    actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                         MINIBATCH_SIZE, TAU, ACTOR_LEARNING_RATE,
                         critic.model)

    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

    #TODO: Ornstein-Uhlenbeck noise.

    sess.run(tf.global_variables_initializer())

    # initialize target net
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE)

    # log variables
    reward_log = []
Example #7
class DDPGAgent(Agent):
    '''An agent that combines the Actor-Critic algorithm with deep learning.
        Receives an environment object together with the relevant learning parameters,
        reads the state and action feature dimensions from env,
        and inherits from the base class Agent.
    '''
    def __init__(self,
                 env: Env = None,
                 capacity=2e6,
                 batch_size=128,
                 action_lim=1,
                 learning_rate=0.001,
                 gamma=0.999,
                 epochs=2):
        if env is None:
            raise ValueError("agent is missing an environment (env)")
        super(DDPGAgent, self).__init__(env, capacity)
        self.state_dim = env.observation_space.shape[0]  # continuous state, get the state dimension
        self.action_dim = env.action_space.shape[0]  # continuous action, get the action dimension
        self.action_lim = action_lim  # action value bound
        self.batch_size = batch_size  # number of transitions per batch update
        self.learning_rate = learning_rate  # learning rate
        self.gamma = gamma  # discount factor
        self.epochs = epochs  # number of training passes over one batch of transitions
        self.tau = 0.01  # soft-update (Polyak) coefficient
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)

        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim,
                                  self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate)

        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate)
        # Initialization:
        # hard-copy the critic and actor parameters into the target networks target_critic and target_actor
        hard_update(self.target_actor, self.actor)  # hard copy
        hard_update(self.target_critic, self.critic)  # hard copy
        return

    def get_exploitation_action(self, state):
        '''Get the action computed by the target_actor network for the given state, without exploration.
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.target_actor.forward(state).detach()
        return action.data.numpy()

    def get_exploration_action(self, state):
        '''Get a noisy action computed by the actor network for the given state, to simulate some exploration.
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.actor.forward(state).detach()
        new_action = action.data.numpy() + (self.noise.sample() *
                                            self.action_lim)
        new_action = new_action.clip(min=-1 * self.action_lim,
                                     max=self.action_lim)
        return new_action

    def _learn_from_memory(self):
        '''Learn from experience and update the parameters of both networks.
        '''
        # randomly sample transitions from memory
        trans_pieces = self.sample(self.batch_size)
        s0 = np.vstack([x.s0 for x in trans_pieces])
        a0 = np.array([x.a0 for x in trans_pieces])
        r1 = np.array([x.reward for x in trans_pieces])
        s1 = np.vstack([x.s1 for x in trans_pieces])

        # optimize the critic network parameters by minimizing the loss
        a1 = self.target_actor.forward(s1).detach()
        next_val = torch.squeeze(self.target_critic.forward(s1, a1).detach())
        r1 = torch.from_numpy(r1)
        # r1 = r1.type(torch.DoubleTensor)
        next_val = next_val.type(torch.DoubleTensor)
        y_expected = r1 + self.gamma * next_val
        y_expected = y_expected.type(torch.FloatTensor)

        a0 = torch.from_numpy(a0)  # convert to a Tensor
        y_predicted = torch.squeeze(self.critic.forward(s0, a0))

        # minimize the loss and update the critic
        loss_critic = F.smooth_l1_loss(y_predicted, y_expected)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # optimize the actor network parameters; the objective is to increase Q
        pred_a0 = self.actor.forward(s0)  # using a0 directly does not converge
        # reversed gradient descent (i.e. gradient ascent), using the estimated state value as the policy objective
        loss_actor = -1 * torch.sum(self.critic.forward(s0, pred_a0))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # soft-update the parameters of the target networks
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning_method(self, display=False, explore=True):
        '''
        Handles the interaction between the agent and the env,
        running one complete episode.

        Returns:
            time_in_episode  number of steps in the current episode
            total_reward     total reward of the current episode
        '''
        self.state = np.float64(self.env.reset())  # initialize the episode state
        time_in_episode, total_reward = 0, 0
        is_done = False  # episode termination flag
        loss_critic, loss_actor = 0.0, 0.0
        while not is_done:
            # add code here
            s0 = self.state  # get the current state
            if explore:
                a0 = self.get_exploration_action(s0)
            else:
                a0 = self.actor.forward(s0).detach().data.numpy()

            s1, r1, is_done, info, total_reward = self.act(
                a0)  # execute the action, get the observation, and store the transition (s0, a0, r1, s1)
            if display:
                self.env.render()

            if self.total_trans > self.batch_size:  # once the number of stored transitions exceeds batch_size, start learning from experience replay
                loss_c, loss_a = self._learn_from_memory()
                loss_critic += loss_c
                loss_actor += loss_a

            time_in_episode += 1
        loss_critic /= time_in_episode
        loss_actor /= time_in_episode
        if display:
            print("{}".format(self.experience.last_episode))
        return time_in_episode, total_reward, loss_critic, loss_actor

    def learning(self, max_episode_num=800, display=False, explore=True):
        '''
        Iterate over all episodes
        and save the training results during training.
        '''
        total_time, episode_reward, num_episode = 0, 0, 0
        total_times, episode_rewards, num_episodes = [], [], []
        for i in tqdm(range(max_episode_num)):  # loop over all episodes
            time_in_episode, episode_reward, loss_critic, loss_actor = \
                self.learning_method(display=display, explore=explore)  # learn from this episode
            total_time += time_in_episode  # total number of steps
            num_episode += 1  # total number of episodes
            total_times.append(total_time)  # store the cumulative step count after each episode
            episode_rewards.append(episode_reward)  # store the reward of each episode
            num_episodes.append(num_episode)  # store the index of each episode
            print("episode:{:3}:loss critic:{:4.3f}, J_actor:{:4.3f}".\
                  format(num_episode-1, loss_critic, -loss_actor))
            if explore and num_episode % 100 == 0:  # save the models
                self.save_models(num_episode)
        return total_times, episode_rewards, num_episodes

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')
        print("Models saved successfully")

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print("Models loaded succesfully")
Example #8
class DDPGAgent(Agent):
    '''
    An agent that combines the Actor-Critic algorithm with deep learning.
    '''
    def __init__(self,
                 env: Env = None,
                 capacity=2e6,
                 batch_size=128,
                 action_lim=1,
                 learning_rate=0.001,
                 gamma=0.999,
                 epochs=2):
        if env is None:
            raise "agent should have an environment"
        super(DDPGAgent, self).__init__(env, capacity)
        self.state_dim = env.observation_space.shape[0]  # continuous state
        self.action_dim = env.action_space.shape[0]  # continuous action
        self.action_lim = action_lim  # action value bound
        self.batch_size = batch_size  # number of transitions per batch update
        self.learning_rate = learning_rate  # learning rate
        self.gamma = gamma  # discount factor
        self.epochs = epochs  # number of training passes over one batch of transitions
        self.tau = 0.001  # soft-update (Polyak) coefficient
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim,
                                  self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate)
        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate)

        hard_update(self.target_actor, self.actor)  # hard copy
        hard_update(self.target_critic, self.critic)  # hard copy
        return

    def get_exploitation_action(self, state):
        '''Get the action computed by the target actor network for the given state, without exploration.
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.target_actor.forward(state).detach()
        return action.data.numpy()

    def get_exploration_action(self, state):
        '''Get a noisy action computed by the actor network for the given state, to simulate some exploration.
        Args:
            state: numpy array
        Returns:
            action: numpy array
        '''
        action = self.actor.forward(state).detach()
        new_action = action.data.numpy() + (self.noise.sample() *
                                            self.action_lim)
        new_action = new_action.clip(min=-1 * self.action_lim,
                                     max=self.action_lim)
        return new_action

    def _learn_from_memory(self):
        '''Learn from memory and update the parameters of both networks.
        '''
        # randomly sample transitions from memory
        trans_pieces = self.sample(self.batch_size)
        s0 = np.vstack([x.s0 for x in trans_pieces])
        a0 = np.array([x.a0 for x in trans_pieces])
        r1 = np.array([x.reward for x in trans_pieces])
        # is_done = np.array([x.is_done for x in trans_pieces])
        s1 = np.vstack([x.s1 for x in trans_pieces])

        # optimize the critic network parameters
        a1 = self.target_actor.forward(s1).detach()
        next_val = torch.squeeze(self.target_critic.forward(s1, a1).detach())

        # y_exp = r + gamma*Q'( s2, pi'(s2))

        y_expected = torch.from_numpy(r1).type(
            torch.FloatTensor) + self.gamma * next_val
        y_expected = y_expected.type(torch.FloatTensor)
        # y_pred = Q( s1, a1)
        a0 = torch.from_numpy(a0)  # convert to a Tensor
        y_predicted = torch.squeeze(self.critic.forward(s0, a0))
        # compute critic loss, and update the critic
        loss_critic = F.smooth_l1_loss(y_predicted, y_expected)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # optimize the actor network parameters; the objective is to increase Q
        pred_a0 = self.actor.forward(s0)  # using a0 directly does not converge
        # reversed gradient descent (i.e. gradient ascent), using the estimated state value as the policy objective
        loss_actor = -1 * torch.sum(self.critic.forward(s0, pred_a0))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # soft-update the target network parameters
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning_method(self, display=False, explore=True):
        self.state = np.float64(self.env.reset())
        time_in_episode, total_reward = 0, 0
        is_done = False
        loss_critic, loss_actor = 0.0, 0.0
        while not is_done:
            # add code here
            s0 = self.state
            if explore:
                a0 = self.get_exploration_action(s0)
            else:
                a0 = self.actor.forward(s0).detach().data.numpy()

            s1, r1, is_done, info, total_reward = self.act(a0)
            if display:
                self.env.render()

            if self.total_trans > self.batch_size:
                loss_c, loss_a = self._learn_from_memory()
                loss_critic += loss_c
                loss_actor += loss_a

            time_in_episode += 1
        loss_critic /= time_in_episode
        loss_actor /= time_in_episode
        if display:
            print("{}".format(self.experience.last_episode))
        return time_in_episode, total_reward, loss_critic, loss_actor

    def learning(self, max_episode_num=800, display=False, explore=True):
        total_time, episode_reward, num_episode = 0, 0, 0
        total_times, episode_rewards, num_episodes = [], [], []
        for i in tqdm(range(max_episode_num)):
            time_in_episode, episode_reward, loss_critic, loss_actor = \
                self.learning_method(display=display, explore=explore)
            total_time += time_in_episode
            num_episode += 1
            total_times.append(total_time)
            episode_rewards.append(episode_reward)
            num_episodes.append(num_episode)
            print("episode:{:3}:loss critic:{:4.3f}, J_actor:{:4.3f}".\
                  format(num_episode-1, loss_critic, -loss_actor))
            if explore and num_episode % 100 == 0:
                self.save_models(num_episode)
        return total_times, episode_rewards, num_episodes

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')
        print("Models saved successfully")

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print("Models loaded succesfully")
Example #9
class DDPG(Algorithm):
    def __init__(self, env):
        super(DDPG, self).__init__()

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net = QNet(self.ns, self.na)
        self.q_net = q_net.to(self.device)

        q_target = QNet(self.ns, self.na)
        self.q_target = q_target.to(self.device)
        self.load_state_dict(self.q_target, self.q_net.state_dict())

        self.optimizer_q = torch.optim.Adam(self.q_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.noise = OrnsteinUhlenbeckActionNoise(
            torch.zeros(1, self.na).to(self.device),
            self.epsilon * torch.ones(1, self.na).to(self.device))

    def play(self, env, evaluate=False):

        if env.k == 0:
            self.noise.reset()

        noise = self.noise() if not evaluate else 0

        if self.env_steps >= self.warmup_steps or evaluate:
            with torch.no_grad():
                a = self.pi_net(env.s) + noise
                a = torch.clamp(a, min=-1, max=1)
        else:
            a = None

        state = env(a)
        return state

    def train(self):

        results = defaultdict(lambda: defaultdict(list))

        for i, (s, a, r, t, stag) in tqdm(enumerate(self.sample())):
            i += 1
            self.train_mode()
            self.optimizer_q.zero_grad()
            self.optimizer_p.zero_grad()

            with torch.no_grad():
                pi_tag = self.pi_target(stag)
                q_target = self.q_target(stag, pi_tag)

            g = r + (1 - t) * self.gamma**self.n_steps * q_target

            qa = self.q_net(s, a)
            loss_q = F.mse_loss(qa, g, reduction='mean')

            loss_q.backward()
            if self.clip_q:
                nn.utils.clip_grad_norm_(self.q_net.parameters(), self.clip_q)
            self.optimizer_q.step()

            if not i % self.delayed_policy_update:

                pi = self.pi_net(s)

                if self.env_steps >= self.warmup_steps:

                    v = self.q_net(s, pi)
                    loss_p = (-v).mean()
                else:

                    loss_p = F.smooth_l1_loss(pi, a)

                loss_p.backward()
                if self.clip_p:
                    nn.utils.clip_grad_norm_(self.pi_net.parameters(),
                                             self.clip_p)
                self.optimizer_p.step()

                results['scalar']['q_est'].append(float(-loss_p))

                soft_update(self.pi_net, self.pi_target, self.tau)

            results['scalar']['loss_q'].append(float(loss_q))

            soft_update(self.q_net, self.q_target, self.tau)

            # if not n % self.target_update:
            #     self.load_state_dict(self.pi_target, self.pi_net.state_dict())
            #     self.load_state_dict(self.q_target, self.q_net.state_dict())

            if not i % self.train_epoch:

                statistics = self.env.get_stats()
                for k, v in statistics.items():
                    for ki, vi in v.items():
                        results[k][ki] = vi

                results['scalar']['rb'] = self.replay_buffer.size
                results['scalar']['env-steps'] = self.env_steps
                results['scalar']['episodes'] = self.episodes
                results['scalar']['train-steps'] = i

                yield results
                results = defaultdict(lambda: defaultdict(list))
Example #10
    def __init__(
            self,
            observation_space,
            action_space,
            actor_class=Actor,
            actor_kwargs={},
            critic_class=Critic,
            critic_kwargs={},
            epsilon_initial=1.0,
            epsilon_final=0.01,
            epsilon_steps=10000,
            batch_size=64,
            gamma=0.99,
            beta=0.5,  # averaging factor between off-policy and on-policy targets during n-step updates
            tau_actor=0.001,  # Polyak averaging factor for updating target weights
            tau_critic=0.001,
            replay_memory=None,  # memory buffer object
            replay_memory_size=1000000,
            learning_rate_actor=0.00001,
            learning_rate_critic=0.001,
            initial_memory_threshold=0,
            clip_grad=10,
            adam_betas=(0.95, 0.999),
            use_ornstein_noise=False,
            # if false, uses epsilon-greedy with uniform-random action-parameter exploration
            loss_func=F.mse_loss,  # F.smooth_l1_loss
            inverting_gradients=False,
            n_step_returns=False,
            seed=None,
            buffer_next_actions=False,
            central_critic=False):

        super(PDDPGAgent, self).__init__(observation_space, action_space)

        self.actions_with_param = 3

        self.action_parameter_sizes = np.array([
            self.action_space.spaces[i].shape[0]
            for i in range(1, self.actions_with_param + 1)
        ])
        self.action_parameter_size = int(self.action_parameter_sizes.sum())
        self.action_max = torch.from_numpy(np.ones(
            (self.act_dim, ))).float().to(device)
        self.action_min = -self.action_max.detach()
        self.action_range = (self.action_max - self.action_min).detach()
        self.action_parameter_max_numpy = np.concatenate([
            self.action_space.spaces[i].high
            for i in range(1, self.actions_with_param + 1)
        ]).ravel()
        self.action_parameter_min_numpy = np.concatenate([
            self.action_space.spaces[i].low
            for i in range(1, self.actions_with_param + 1)
        ]).ravel()
        self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                             self.action_parameter_min_numpy)
        self.action_parameter_max = torch.from_numpy(
            self.action_parameter_max_numpy).float().to(device)
        self.action_parameter_min = torch.from_numpy(
            self.action_parameter_min_numpy).float().to(device)
        self.action_parameter_range = torch.from_numpy(
            self.action_parameter_range_numpy).float().to(device)

        self.epsilon = epsilon_initial
        self.epsilon_initial = epsilon_initial
        self.epsilon_final = epsilon_final
        self.epsilon_steps = epsilon_steps

        self.clip_grad = clip_grad
        self.batch_size = batch_size
        self.gamma = gamma
        self.beta = beta
        self.replay_memory_size = replay_memory_size
        self.initial_memory_threshold = initial_memory_threshold
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.inverting_gradients = inverting_gradients
        self.tau_actor = tau_actor
        self.tau_critic = tau_critic
        self._step = 0
        self._episode = 0
        self.updates = 0

        self.np_random = None
        self.seed = seed
        self._seed(seed)

        self.buffer_next_actions = buffer_next_actions
        self.use_ornstein_noise = use_ornstein_noise
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.action_parameter_size,
            random_machine=self.np_random,
            mu=0.,
            theta=0.15,
            sigma=0.0001)

        # print(self.act_dim + self.action_parameter_size)
        self.n_step_returns = n_step_returns
        self.loss_func = loss_func  # l1_smooth_loss performs better but original paper used MSE

        self.actor = actor_class(self.obs_dim, self.act_dim,
                                 self.action_parameter_size,
                                 **actor_kwargs).to(device)
        self.actor_target = actor_class(self.obs_dim, self.act_dim,
                                        self.action_parameter_size,
                                        **actor_kwargs).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.learning_rate_actor,
                                          betas=adam_betas)

        if not central_critic:
            if replay_memory is None:
                self.replay_memory = MemoryNStepReturns(
                    replay_memory_size,
                    observation_space.shape,
                    (self.act_dim + self.action_parameter_size, ),
                    next_actions=self.buffer_next_actions,
                    n_step_returns=self.n_step_returns)
            else:
                self.replay_memory = replay_memory

            self.critic = critic_class(self.obs_dim, self.act_dim,
                                       self.action_parameter_size,
                                       **critic_kwargs).to(device)
            self.critic_target = critic_class(self.obs_dim, self.act_dim,
                                              self.action_parameter_size,
                                              **critic_kwargs).to(device)
            hard_update_target_network(self.critic, self.critic_target)
            self.critic_target.eval()
            self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                               lr=self.learning_rate_critic,
                                               betas=adam_betas)
Example #11
class PDDPGAgent(BaseAgent):
    """
    DDPG actor-critic agent for parameterised action spaces
    [Hausknecht and Stone 2016]
    """
    def __init__(
            self,
            observation_space,
            action_space,
            actor_class=Actor,
            actor_kwargs={},
            critic_class=Critic,
            critic_kwargs={},
            epsilon_initial=1.0,
            epsilon_final=0.01,
            epsilon_steps=10000,
            batch_size=64,
            gamma=0.99,
            beta=0.5,  # averaging factor between off-policy and on-policy targets during n-step updates
            tau_actor=0.001,  # Polyak averaging factor for updating target weights
            tau_critic=0.001,
            replay_memory=None,  # memory buffer object
            replay_memory_size=1000000,
            learning_rate_actor=0.00001,
            learning_rate_critic=0.001,
            initial_memory_threshold=0,
            clip_grad=10,
            adam_betas=(0.95, 0.999),
            use_ornstein_noise=False,
            # if false, uses epsilon-greedy with uniform-random action-parameter exploration
            loss_func=F.mse_loss,  # F.smooth_l1_loss
            inverting_gradients=False,
            n_step_returns=False,
            seed=None,
            buffer_next_actions=False,
            central_critic=False):

        super(PDDPGAgent, self).__init__(observation_space, action_space)

        self.actions_with_param = 3

        self.action_parameter_sizes = np.array([
            self.action_space.spaces[i].shape[0]
            for i in range(1, self.actions_with_param + 1)
        ])
        self.action_parameter_size = int(self.action_parameter_sizes.sum())
        self.action_max = torch.from_numpy(np.ones(
            (self.act_dim, ))).float().to(device)
        self.action_min = -self.action_max.detach()
        self.action_range = (self.action_max - self.action_min).detach()
        self.action_parameter_max_numpy = np.concatenate([
            self.action_space.spaces[i].high
            for i in range(1, self.actions_with_param + 1)
        ]).ravel()
        self.action_parameter_min_numpy = np.concatenate([
            self.action_space.spaces[i].low
            for i in range(1, self.actions_with_param + 1)
        ]).ravel()
        self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                             self.action_parameter_min_numpy)
        self.action_parameter_max = torch.from_numpy(
            self.action_parameter_max_numpy).float().to(device)
        self.action_parameter_min = torch.from_numpy(
            self.action_parameter_min_numpy).float().to(device)
        self.action_parameter_range = torch.from_numpy(
            self.action_parameter_range_numpy).float().to(device)

        self.epsilon = epsilon_initial
        self.epsilon_initial = epsilon_initial
        self.epsilon_final = epsilon_final
        self.epsilon_steps = epsilon_steps

        self.clip_grad = clip_grad
        self.batch_size = batch_size
        self.gamma = gamma
        self.beta = beta
        self.replay_memory_size = replay_memory_size
        self.initial_memory_threshold = initial_memory_threshold
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.inverting_gradients = inverting_gradients
        self.tau_actor = tau_actor
        self.tau_critic = tau_critic
        self._step = 0
        self._episode = 0
        self.updates = 0

        self.np_random = None
        self.seed = seed
        self._seed(seed)

        self.buffer_next_actions = buffer_next_actions
        self.use_ornstein_noise = use_ornstein_noise
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.action_parameter_size,
            random_machine=self.np_random,
            mu=0.,
            theta=0.15,
            sigma=0.0001)

        # print(self.act_dim + self.action_parameter_size)
        self.n_step_returns = n_step_returns
        self.loss_func = loss_func  # l1_smooth_loss performs better but original paper used MSE

        self.actor = actor_class(self.obs_dim, self.act_dim,
                                 self.action_parameter_size,
                                 **actor_kwargs).to(device)
        self.actor_target = actor_class(self.obs_dim, self.act_dim,
                                        self.action_parameter_size,
                                        **actor_kwargs).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.learning_rate_actor,
                                          betas=adam_betas)

        if not central_critic:
            if replay_memory is None:
                self.replay_memory = MemoryNStepReturns(
                    replay_memory_size,
                    observation_space.shape,
                    (self.act_dim + self.action_parameter_size, ),
                    next_actions=self.buffer_next_actions,
                    n_step_returns=self.n_step_returns)
            else:
                self.replay_memory = replay_memory

            self.critic = critic_class(self.obs_dim, self.act_dim,
                                       self.action_parameter_size,
                                       **critic_kwargs).to(device)
            self.critic_target = critic_class(self.obs_dim, self.act_dim,
                                              self.action_parameter_size,
                                              **critic_kwargs).to(device)
            hard_update_target_network(self.critic, self.critic_target)
            self.critic_target.eval()
            self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                               lr=self.learning_rate_critic,
                                               betas=adam_betas)

    def __str__(self):
        desc = ("P-DDPG Agent with frozen initial weight layer\n" +
                "Actor: {}\n".format(self.actor) +
                "Critic: {}\n".format(self.critic) +
                "Actor Alpha: {}\n".format(self.learning_rate_actor) +
                "Critic Alpha: {}\n".format(self.learning_rate_critic) +
                "Gamma: {}\n".format(self.gamma) +
                "Tau Actor: {}\n".format(self.tau_actor) +
                "Tau Critic: {}\n".format(self.tau_critic) +
                "Beta: {}\n".format(self.beta) +
                "Inverting Gradients: {}\n".format(self.inverting_gradients) +
                "Replay Memory: {}\n".format(self.replay_memory_size) +
                "epsilon_initial: {}\n".format(self.epsilon_initial) +
                "epsilon_final: {}\n".format(self.epsilon_final) +
                "epsilon_steps: {}\n".format(self.epsilon_steps) +
                "Clip norm: {}\n".format(self.clip_grad) +
                "Batch Size: {}\n".format(self.batch_size) +
                "Ornstein Noise?: {}\n".format(self.use_ornstein_noise) +
                "Seed: {}\n".format(self.seed))
        return desc

    def set_action_parameter_passthrough_weights(self,
                                                 initial_weights,
                                                 initial_bias=None):
        passthrough_layer = self.actor.action_parameters_passthrough_layer
        print(initial_weights.shape)
        print(passthrough_layer.weight.data.size())
        assert initial_weights.shape == passthrough_layer.weight.data.size()
        passthrough_layer.weight.data = torch.Tensor(
            initial_weights).float().to(device)
        if initial_bias is not None:
            print(initial_bias.shape)
            print(passthrough_layer.bias.data.size())
            assert initial_bias.shape == passthrough_layer.bias.data.size()
            passthrough_layer.bias.data = torch.Tensor(
                initial_bias).float().to(device)
        passthrough_layer.requires_grad = False
        passthrough_layer.weight.requires_grad = False
        passthrough_layer.bias.requires_grad = False
        hard_update_target_network(self.actor, self.actor_target)

    def _invert_gradients(self, grad, vals, grad_type, inplace=True):
        # 5x faster on CPU
        if grad_type == "actions":
            max_p = self.action_max.cpu()
            min_p = self.action_min.cpu()
            rnge = self.action_range.cpu()
        elif grad_type == "action_parameters":
            max_p = self.action_parameter_max.cpu()
            min_p = self.action_parameter_min.cpu()
            rnge = self.action_parameter_range.cpu()
        else:
            raise ValueError("Unhandled grad_type: '" + str(grad_type) + "'")

        assert grad.shape == vals.shape

        if not inplace:
            grad = grad.clone()
        with torch.no_grad():
            for n in range(grad.shape[0]):
                # index = grad < 0  # actually > but Adam minimises, so reversed (could also double negate the grad)
                index = grad[n] > 0
                grad[n][index] *= (index.float() * (max_p - vals[n]) /
                                   rnge)[index]
                grad[n][~index] *= ((~index).float() * (vals[n] - min_p) /
                                    rnge)[~index]

        return grad

    def _seed(self, seed=None):
        """
        NOTE: this will not reset the randomly initialised weights; use the seed parameter in the constructor instead.

        :param seed:
        :return:
        """
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)
        self.np_random = np.random.RandomState(seed=seed)
        if seed is not None:
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)

    def _ornstein_uhlenbeck_noise(self, all_action_parameters):
        """ Continuous action exploration using an Ornstein–Uhlenbeck process. """
        return all_action_parameters.data.numpy() + (
            self.noise.sample() * self.action_parameter_range_numpy)

    def end_episode(self):
        self._episode += 1

        # anneal exploration
        if self._episode < self.epsilon_steps:
            self.epsilon = self.epsilon_initial - (
                self.epsilon_initial -
                self.epsilon_final) * (self._episode / self.epsilon_steps)
        else:
            self.epsilon = self.epsilon_final
        pass

    def act(self, obs):
        with torch.no_grad():
            obs = torch.from_numpy(obs).to(device)
            all_actions, all_action_parameters = self.actor.forward(obs)
            all_actions = all_actions.detach().cpu().data.numpy()
            all_action_parameters = all_action_parameters.detach().cpu(
            ).data.numpy()

            # Hausknecht and Stone [2016] use epsilon greedy actions with uniform random action-parameter exploration
            if self.np_random.uniform() < self.epsilon:
                all_actions = self.np_random.uniform(size=all_actions.shape)
                offsets = np.array([
                    self.action_parameter_sizes[i]
                    for i in range(self.actions_with_param)
                ],
                                   dtype=int).cumsum()
                offsets = np.concatenate((np.array([0]), offsets))
                if not self.use_ornstein_noise:
                    for i in range(self.actions_with_param):
                        all_action_parameters[offsets[i]:offsets[
                            i + 1]] = self.np_random.uniform(
                                self.action_parameter_min_numpy[
                                    offsets[i]:offsets[i + 1]],
                                self.action_parameter_max_numpy[
                                    offsets[i]:offsets[i + 1]])

            # select maximum action
            action = np.argmax(all_actions)
            offset = np.array(
                [self.action_parameter_sizes[i] for i in range(action)],
                dtype=int).sum()
            if self.use_ornstein_noise and self.noise is not None:
                all_action_parameters[offset:offset + self.action_parameter_sizes[action]] += \
                    self.noise.sample()[offset:offset + self.action_parameter_sizes[action]]
            if action < self.actions_with_param:
                action_parameters = all_action_parameters[
                    offset:offset + self.action_parameter_sizes[action]]
            else:
                action_parameters = None
        return action, action_parameters, all_actions, all_action_parameters

    # def step(self, state, action, reward, next_state, next_action, terminal, time_steps=1, optimise=True):
    #     action, action_params, all_actions, all_action_parameters = action
    #     self._step += 1

    #     self._add_sample(state, np.concatenate((all_actions.data, all_action_parameters.data)).ravel(), reward,
    #                      next_state, terminal)
    #     if optimise and self._step >= self.batch_size and self._step >= self.initial_memory_threshold:
    #         self.update()

    # def _add_sample(self, state, action, reward, next_state, terminal):
    #     assert not self.n_step_returns
    #     assert len(action) == self.act_dim + self.action_parameter_size
    #     self.replay_memory.append(state, action, reward, next_state, terminal)

    def update(self):
        if self.replay_memory.nb_entries < self.batch_size or \
                self.replay_memory.nb_entries < self.initial_memory_threshold:
            return

        # Sample a batch from replay memory
        if self.n_step_returns:
            obs, actions, rewards, next_obs, terminals, n_step_returns = \
                self.replay_memory.sample(self.batch_size, random_machine=self.np_random)
        else:
            obs, actions, rewards, next_obs, terminals = \
                self.replay_memory.sample(self.batch_size, random_machine=self.np_random)
            n_step_returns = None

        obs = torch.from_numpy(obs).to(device)
        actions_combined = torch.from_numpy(actions).to(
            device)  # make sure to separate actions and action-parameters
        actions = actions_combined[:, :self.act_dim]
        action_parameters = actions_combined[:, self.act_dim:]
        rewards = torch.from_numpy(rewards).to(device)
        next_obs = torch.from_numpy(next_obs).to(device)
        terminals = torch.from_numpy(terminals).to(device)
        if self.n_step_returns:
            n_step_returns = torch.from_numpy(n_step_returns).to(device)

        # ---------------------- optimize critic ----------------------
        with torch.no_grad():
            # use actor_target to predict next action
            pred_next_actions, pred_next_action_parameters = self.actor_target.forward(
                next_obs)
            # use critic_target to predict target value
            off_policy_next_val = \
                self.critic_target.forward(next_obs, pred_next_actions, pred_next_action_parameters)
            off_policy_target = rewards + (
                1 - terminals) * self.gamma * off_policy_next_val

            if self.n_step_returns:
                on_policy_target = n_step_returns
                target = self.beta * on_policy_target + (
                    1. - self.beta) * off_policy_target
            else:
                target = off_policy_target

        y_expected = target
        # use critic to predict actual value
        y_predicted = self.critic.forward(obs, actions, action_parameters)
        loss_critic = self.loss_func(y_predicted, y_expected)

        # update critic
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           self.clip_grad)
        self.critic_optimizer.step()

        # ---------------------- optimise actor ----------------------
        # 1 - calculate gradients from critic
        with torch.no_grad():
            # use actor to make action (with no grad)
            actions, action_params = self.actor(obs)
            action_params = torch.cat((actions, action_params), dim=1)
        action_params.requires_grad = True

        # use critic and compute its gradients
        Q_val = self.critic(obs, action_params[:, :self.act_dim],
                            action_params[:, self.act_dim:]).mean()
        self.critic.zero_grad()
        Q_val.backward()

        from copy import deepcopy
        delta_a = deepcopy(action_params.grad.data)

        # 2 - apply inverting gradients and combine with gradients from actor
        # use actor to make action again (with grad)
        actions, action_params = self.actor(Variable(obs))
        action_params = torch.cat((actions, action_params), dim=1)

        # invert gradients of actions and parameters separately
        delta_a[:, self.act_dim:] = self._invert_gradients(
            delta_a[:, self.act_dim:].cpu(),
            action_params[:, self.act_dim:].cpu(),
            grad_type="action_parameters",
            inplace=True)
        delta_a[:, :self.act_dim] = self._invert_gradients(
            delta_a[:, :self.act_dim].cpu(),
            action_params[:, :self.act_dim].cpu(),
            grad_type="actions",
            inplace=True)
        out = -torch.mul(delta_a, action_params)
        self.actor.zero_grad()
        out.backward(torch.ones(out.shape).to(device))

        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.clip_grad)
        self.actor_optimizer.step()

        # ---------------- soft update actor and critic ---------------
        soft_update_target_network(self.actor, self.actor_target,
                                   self.tau_actor)
        soft_update_target_network(self.critic, self.critic_target,
                                   self.tau_critic)