def test():
    env = gym.make(args.env_name)

    env.seed(10)
    torch.manual_seed(10)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor = Actor(state_size, action_size, args)
    actor.load_state_dict(torch.load(args.load_model))

    for ep in range(args.test_iter):
        score = 0
        done = False
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        while not done:
            mu, std = actor(torch.Tensor(state))
            # evaluate deterministically: step the environment with the policy
            # mean rather than a sample from actor.get_action(mu, std)
            next_state, reward, done, info = env.step(mu.detach().numpy())
            env.render()

            score += reward
            next_state = np.reshape(next_state, [1, state_size])
            state = next_state
        if ep % args.log_interval == 0:
            print(ep, " ep | score ", score)
    env.close()
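
# A minimal, hypothetical entry point for test() above (not part of the original
# snippet): it only defines the argparse attributes that test() actually reads
# (env_name, load_model, test_iter, log_interval); any additional Actor
# hyper-parameters carried on `args` would have to be added as well.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", type=str, default="Pendulum-v0")
    parser.add_argument("--load_model", type=str, required=True)
    parser.add_argument("--test_iter", type=int, default=10)
    parser.add_argument("--log_interval", type=int, default=1)
    args = parser.parse_args()

    test()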
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, agent_count, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(agent_count * state_size,
                                   agent_count * action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(agent_count * state_size,
                                    agent_count * action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
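        # Note: the critic consumes the concatenation of every agent's state and
        # action (hence the agent_count * ... input sizes), i.e. a centralized
        # critic, while each actor conditions only on its own observation.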

    def soft_update(self):
        self.soft_update_network(self.critic_local, self.critic_target, TAU)
        self.soft_update_network(self.actor_local, self.actor_target, TAU)

    def soft_update_network(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, name):
        torch.save(self.actor_local.state_dict(), name + '_actor.pth')
        torch.save(self.critic_local.state_dict(), name + '_critic.pth')

    def load(self, name):
        self.actor_local.load_state_dict(torch.load(name + '_actor.pth'))
        self.critic_local.load_state_dict(torch.load(name + '_critic.pth'))
Example #3
def train_tsp(args):

    # Goals from paper:
    # TSP20, 3.97
    # TSP50, 6.08
    # TSP100, 8.44

    from tasks import tsp
    from tasks.tsp import TSPDataset

    STATIC_SIZE = 2 # (x, y)
    DYNAMIC_SIZE = 1 # dummy for compatibility

    train_data = TSPDataset(args.num_nodes, args.train_size, args.seed)
    valid_data = TSPDataset(args.num_nodes, args.valid_size, args.seed + 1)

    update_fn = None

    actor = Actor(STATIC_SIZE,
                  DYNAMIC_SIZE,
                  args.hidden_size,
                  update_fn,
                  tsp.update_mask,
                  args.num_layers,
                  args.dropout).to(device)

    critic = Critic(STATIC_SIZE, DYNAMIC_SIZE, args.hidden_size).to(device)

    kwargs = vars(args)
    kwargs['train_data'] = train_data
    kwargs['valid_data'] = valid_data
    kwargs['reward_fn'] = tsp.reward
    kwargs['render_fn'] = tsp.render

    if args.checkpoint:
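        # torch.load's second positional argument is map_location, so the
        # checkpoint tensors are loaded directly onto `device`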
        path = os.path.join(args.checkpoint, 'actor.pt')
        actor.load_state_dict(torch.load(path, device))

        path = os.path.join(args.checkpoint, 'critic.pt')
        critic.load_state_dict(torch.load(path, device))

    if not args.test:
        train(actor, critic, **kwargs)

    test_data = TSPDataset(args.num_nodes, args.train_size, args.seed + 2)

    test_dir = 'test'
    test_loader = DataLoader(test_data, args.batch_size, False, num_workers=0)
    out = validate(test_loader, actor, tsp.reward, tsp.render, test_dir, num_plot=5)

    print('Average tour length: ', out)
Example #4
def init_model(env, model_args, ckpt=None):
    # get input/output size and range
    s_dim = env.get_state_size()
    a_dim = env.get_action_size()
    a_min = env.a_min
    a_max = env.a_max
    a_noise = model_args["noise"] * np.ones(a_dim)

    # get reference memory for FFC
    ref_mem = env._mocap.get_ref_mem()
    if not model_args["with_ffc"]:
        ref_mem.fill(0)
    ref_mem = ref_mem[:, 1:]  # no phase velocity

    # automatically use gpu
    if use_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    from model import Normalizer, Actor, Critic
    GAMMA = file_args["train_args"]["gamma"]
    non_norm = [0]  #FMD0
    s_norm = Normalizer(s_dim, non_norm)
    actor = Actor(s_dim, a_dim, a_min, a_max, a_noise, ref_mem.shape[0])
    critic = Critic(s_dim, 0, 1 / (1 - GAMMA))

    actor.set_reference(ref_mem)
    actor.ref_mem.requires_grad = False

    # NOTE: the `ckpt` parameter is unused here; the checkpoint path is read
    # from the global args.ckpt instead.
    if args.ckpt is not None:
        try:
            checkpoint = torch.load(args.ckpt)
            actor.load_state_dict(checkpoint["actor"])
            critic.load_state_dict(checkpoint["critic"])
            s_norm.load_state_dict(checkpoint["s_norm"])
            print("loaded checkpoint from %s" % args.ckpt)
        except Exception as err:
            raise RuntimeError("failed to load checkpoint from %s" %
                               args.ckpt) from err

    return s_norm, actor, critic
Example #5
class DDPGTrainer(object):
    def __init__(self):
        self.actor = Actor().to(device)
        self.actor_target = Actor().to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=0.0001)

        self.critic = Critic().to(device)
        self.critic_target = Critic().to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 weight_decay=1e-2)

        self.loss = torch.nn.MSELoss()

    def train(self,
              replay_buffer,
              iterations,
              batch_size=64,
              discount=0.99,
              tau=0.001):

        for it in range(iterations):
            # Sample replay buffer
            smp = replay_buffer.sample(batch_size)
            x, y, u, r, d = smp
            state = torch.FloatTensor(x).to(device)
            action = torch.stack(u)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # Compute the target Q value
            ac = self.actor_target(next_state)
            ac = torch.cat(ac, dim=1)
            target_Q = self.critic_target(next_state, ac)
            target_Q = reward + (done * discount * target_Q).detach()
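            # NOTE: this assumes the buffer stores `d` as a continuation mask
            # (1 - done), as in the reference TD3 code, so the bootstrap term
            # vanishes on terminal transitions.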

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.loss(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward(retain_graph=True)
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(state, torch.cat(self.actor(state),
                                                       dim=1)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(tau * param.data +
                                        (1 - tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(tau * param.data +
                                        (1 - tau) * target_param.data)
Example #6
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5

    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)

    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)
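    # ZFilter keeps a running mean/std of the observations and clips the
    # normalized values to +/-5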

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))

    demonstrations = np.array(expert_demo[0])

    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True
    # accuracy histories for the plots below (referenced later but never
    # initialized in the original snippet)
    temp_learner = []
    temp_expert = []

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                # the discrete environment is stepped with the arg-max component
                # of the same sampled action (avoids sampling twice)
                action2 = np.argmax(action)
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))

            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path,
                                         'ckpt_' + str(score_avg) + '.pth.tar')

                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
Example #7
class DDPGAgent:
    def __init__(self,
                 plot=True,
                 seed=1,
                 env: gym.Env = None,
                 batch_size=128,
                 learning_rate_actor=0.001,
                 learning_rate_critic=0.001,
                 weight_decay=0.01,
                 gamma=0.999):

        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001

        self._to_tensor = util.to_tensor
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim,
                                  self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate_actor,
                                                weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(),
            self.learning_rate_critic,
            weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.t = 0

    def _learn_from_memory(self, memory):
        '''Learn from replay memory and update both networks' parameters.'''
        # sample a random batch of Transitions from memory
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # optimize the critic network's parameters
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)

        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        # NOTE: the bootstrap term should be masked out on terminal transitions,
        # i.e. multiplied by (1 - done); this assumes `is_done` already holds
        # that continuation mask.
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * self._to_tensor(terminal_batch.astype(np.float32), device=self.device) * next_q_values
        q_batch = self.critic.forward(s0,
                                      self._to_tensor(a0, device=self.device))

        # compute the critic loss and update the critic parameters
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        #self.critic_optimizer.zero_grad()
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # actor update: use the critic's value estimate of the current states
        # as the policy objective (gradient ascent on Q)
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        #self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # soft-update the target networks
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
Example #8
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print('state size:', state_size)
    print('action size:', action_size)

    actor = Actor(state_size, action_size, args)

    if args.load_model is not None:
        pretrained_model_path = os.path.join(os.getcwd(), 'save_model',
                                             str(args.load_model))
        pretrained_model = torch.load(pretrained_model_path)
        actor.load_state_dict(pretrained_model)

    ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma)
    steps = 0

    for episode in range(args.iter):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()
Example #9
def main(args):

    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    # required to map between integer-value sentences and real sentences
    w2i, i2w = vocab['w2i'], vocab['i2w']

    # make sure our models for the VAE and Actor exist
    if not os.path.exists(args.load_vae):
        raise FileNotFoundError(args.load_vae)

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    model.load_state_dict(
        torch.load(args.load_vae, map_location=lambda storage, loc: storage))
    model.eval()
    print("vae model loaded from %s"%(args.load_vae))

    # to run in constraint mode, we need the trained generator
    if args.constraint_mode:
        if not os.path.exists(args.load_actor):
            raise FileNotFoundError(args.load_actor)

        actor = Actor(
            dim_z=args.latent_size, dim_model=2048, num_labels=args.n_tags)
        actor.load_state_dict(
            torch.load(args.load_actor, map_location=lambda storage, loc:storage))
        actor.eval()
        print("actor model loaded from %s"%(args.load_actor))

    if torch.cuda.is_available():
        model = model.cuda()
        if args.constraint_mode:
            actor = actor.cuda() # TODO: to(self.devices)

    if args.sample:
        print('*** SAMPLE Z: ***')
        # get samples from the prior
        sample_sents, z = model.inference(n=args.num_samples)
        sample_sents, sample_tags = get_sents_and_tags(sample_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_sents, 'samples/sents_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_tags, 'samples/tags_sample_n{}.pkl'.format(args.num_samples))
        print(*sample_sents, sep='\n')

        if args.constraint_mode:

            print('*** SAMPLE Z_PRIME: ***')
            # get samples from the prior, conditioned via the actor
            all_tags_sample_prime = []
            all_sents_sample_prime = {}
            all_z_sample_prime = {}
            for i, condition in enumerate(LABELS):

                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples, 1).cuda()

                # take z and manipulate using the actor to generate z_prime
                z_prime = actor.forward(z, labels)

                sample_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                sample_sents_prime, sample_tags_prime = get_sents_and_tags(
                    sample_sents_prime, i2w, w2i)
                print('conditioned on: {}'.format(condition))
                print(*sample_sents_prime, sep='\n')
                all_tags_sample_prime.append(sample_tags_prime)
                all_sents_sample_prime[LABEL_NAMES[i]] = sample_sents_prime
                all_z_sample_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()
            pickle_it(all_tags_sample_prime, 'samples/tags_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_sents_sample_prime, 'samples/sents_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_z_sample_prime, 'samples/z_sample_prime_n{}.pkl'.format(args.num_samples))

    if args.interpolate:
        # get random samples from the latent space
        z1 = torch.randn([args.latent_size]).numpy()
        z2 = torch.randn([args.latent_size]).numpy()
        z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=args.num_samples-2)).float())

        print('*** INTERP Z: ***')
        interp_sents, _ = model.inference(z=z)
        interp_sents, interp_tags = get_sents_and_tags(interp_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_sents, 'samples/sents_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_tags, 'samples/tags_interp_n{}.pkl'.format(args.num_samples))
        print(*interp_sents, sep='\n')

        if args.constraint_mode:
            print('*** INTERP Z_PRIME: ***')
            all_tags_interp_prime = []
            all_sents_interp_prime = {}
            all_z_interp_prime = {}

            for i, condition in enumerate(LABELS):

                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples, 1).cuda()

                # z prime conditioned on this particular binary variable
                z_prime = actor.forward(z, labels)

                interp_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                interp_sents_prime, interp_tags_prime = get_sents_and_tags(
                    interp_sents_prime, i2w, w2i)
                print('conditioned on: {}'.format(condition))
                print(*interp_sents_prime, sep='\n')
                all_tags_interp_prime.append(interp_tags_prime)
                all_sents_interp_prime[LABEL_NAMES[i]] = interp_sents_prime
                all_z_interp_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()

            pickle_it(all_tags_interp_prime, 'samples/tags_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_sents_interp_prime, 'samples/sents_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_z_interp_prime, 'samples/z_interp_prime_n{}.pkl'.format(args.num_samples))

    import IPython; IPython.embed()
Example #10
# Import the Actor model
from model import Actor

# Create instance of Reacher environment
env = UnityEnvironment(file_name='Reacher-20.app')  # Update the app name/location if not using macOS

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Load Actor model weights
actor = Actor(state_size=33, action_size=4, seed=0)
actor.load_state_dict(torch.load('checkpoint_actor.pth'))


# Testing
def test(state, agents, action_size):
    """
    Testing the Reacher agent for a single agent

    Params
    ======
        state (numpy.ndarray): Current state that the agents are experiencing
        agents (int):          The number of agents (= 20 in this case)
        action_size (int):     Number of possible actions an agent can take
    """

    global actor
Example #11
class Agent():
    """ Interacts with and learns from the environment """
    def __init__(self, state_size, action_size, num_agents, seed):
        """
        Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents to run
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)

        # Actor network (with target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (with target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.steps_counter = 0
        self.train_counter = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
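        # each of the num_agents parallel agents contributes its own transition
        # to the single shared replay buffer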
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def add(self, state, action, reward, next_state, done):
        """ Save experience to replay memory """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def learn_from_buffer(self, train_counter):
        for i in range(train_counter):
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """ Returns actions for given state as per current policy """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ 
        Update policy/value parameters using batch of experience tuples

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ############# Update Critic #############
        # Get predicted next-state actions and Q-values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss (clip gradients after backward(), before the step)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ############# Update Actor #############
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ############# Update Target Networks #############
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (pyTorch model): where weights come from
            target_model (pyTorch model): where weights will go
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def set_target_model(self, actor, critic):  # ??
        self.actor_target = actor
        self.critic_target = critic

    def get_target_model(self):
        return self.actor_target, self.critic_target

    def load_weights(self, actor_path, critic_path):
        # Actor
        self.actor_local.load_state_dict(torch.load(actor_path))
        self.actor_target.load_state_dict(torch.load(actor_path))

        # Critic
        self.critic_local.load_state_dict(torch.load(critic_path))
        self.critic_target.load_state_dict(torch.load(critic_path))
Example #12
def val(train_models,
        n_episodes=100000,
        max_t=1000,
        print_every=10,
        eps=0,
        temperature=1,
        actor_state_dict=None):
    log_str = "Start Validating !!!"
    epf.write(log_str)
    epf.flush()
    print(log_str)
    train_env = None

    for train_model in train_models:
        start = datetime.now()
        start_date = "{}".format(start.date())
        start_time = "{}".format(start.time())
        val_actor = Actor(dim_size=dim_size,
                          resource_size=2,
                          n_action_steps=2,
                          action_size=12,
                          h_size=128,
                          num_steps=opt.num_steps).to(device)
        val_actor.load_state_dict(actor_state_dict)

        val_agent = Agent(val_actor, h_size=128, device=device)
        val_agent.set_fitness(opt.fitness)
        val_agent.reset()
        best_score = float("-Inf")
        best_reward_whole = float("-Inf")
        best_constraint = 0.
        best_episode = 0
        best_sol_whole = None
        scores_window = deque(maxlen=print_every)
        scores = []
        episodes = 0
        has_succeed_history = False
        train_m_file = os.path.join(m_file_path, train_model + ".csv")
        train_df = pd.read_csv(train_m_file)
        train_model_defs = train_df.to_numpy()
        if train_env is None:
            train_env = MaestroEnvironment(model_defs=train_model_defs,
                                           dim_size=dim_size,
                                           resource_size=2,
                                           n_action_steps=2,
                                           dataflow=opt.df)
        else:
            train_env.reset_dimension(model_defs=train_model_defs)
        train_env.set_fitness(opt.fitness)
        train_env.set_constraint(opt.cstr)
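        # evaluate the reference constraint at the four corners of the action
        # space (bound/bottom combinations) to bracket the feasible range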
        constraint_temp = [
            train_env.get_ref_constraint([action_bound[0], action_bound[1]]),
            train_env.get_ref_constraint([action_bottom[0], action_bottom[1]]),
            train_env.get_ref_constraint([action_bound[0], action_bottom[1]]),
            train_env.get_ref_constraint([action_bottom[0], action_bound[1]])
        ]
        max_constraint, min_constraint = max(constraint_temp), min(
            constraint_temp)
        # epf.write("Max constraint: {}\n".format(max_constraint))
        # epf.flush()
        # epf.write("Min constraint: {}\n".format(min_constraint))
        # epf.flush()
        set_constraint = min_constraint + (max_constraint -
                                           min_constraint) * ratio
        train_env.set_constraint_value(max_constraint, min_constraint,
                                       set_constraint)

        theta = {}
        for name, param in val_actor.named_parameters():
            theta[name] = param
        for i_adapt in range(n_episodes):
            score = 0
            # env.shuffle_model()
            state, infos = train_env.reset()
            for _ in range(max_t):
                action, log_prob = val_agent.act(state, infos, eps,
                                                 temperature, theta)
                # print(log_prob)
                # pdb.set_trace()
                next_state, reward, done, infos, sig, impt = train_env.step(
                    action)
                val_agent.step(state, action, log_prob, reward, next_state,
                               done, sig, impt, infos)
                state = next_state
                score += reward
                if done:
                    break

            # inner-loop gradient: one MAML-style adaptation step on the learned
            # loss (first_order skips tracking second-order terms)
            learned_loss = compute_learned_loss(val_agent, theta)
            if learned_loss is None:
                continue

            inner_gradient = torch.autograd.grad(
                learned_loss,
                [v for _, v in theta.items()],
                create_graph=not first_order,
                retain_graph=not first_order,
                allow_unused=True,
            )

            theta = SGD_step(theta, inner_gradient, INNER_LR)

            env_chkpt = train_env.get_chkpt()
            if env_chkpt["best_sol"] is not None:
                best_sol = env_chkpt["best_sol"]["sol"]
                set_constraint = env_chkpt["ctrs_info"]["value"]
                max_constraint = env_chkpt["ctrs_info"]["max"]
                min_constraint = env_chkpt["ctrs_info"]["min"]
                episode_reward, episode_constraint = train_env.exterior_search(
                    best_sol)
                if episode_reward > best_reward_whole:
                    best_reward_whole = episode_reward
                    best_constraint = episode_constraint
                    best_episode = i_adapt
                    best_sol_whole = best_sol

        val_agent.reset()
        log_str = f'RL result: {train_model} at {best_episode} reward: {best_reward_whole:.4e}, Used Constraint: {best_constraint/train_env.constraint_value * 100:.1f}%\n'
        print(log_str)
        epf.write(log_str)
        epf.flush()

        end = datetime.now()
        end_date = "{}".format(end.date())
        end_time = "{}".format(end.time())
        log_str = 'Running time: from {} to {}'.format(start_time, end_time)
        print(log_str)
        epf.write(log_str)
        epf.flush()

        if model2sol[train_model][
                'best_reward'] is None or best_reward_whole > model2sol[
                    train_model]['best_reward']:
            model2sol[train_model]['best_reward'] = best_reward_whole
            model2sol[train_model]['best_sol'] = best_sol_whole

        for f in glob.glob("*.m"):
            os.remove(f)
        for f in glob.glob("*.csv"):
            os.remove(f)
Example #13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic1_local = Critic(state_size, action_size).to(device)
        self.critic1_target = Critic(state_size, action_size).to(device)
        self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        self.critic2_local = Critic(state_size, action_size).to(device)
        self.critic2_target = Critic(state_size, action_size).to(device)
        self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = PER(BUFFER_SIZE)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        # Set reward as initial priority, see:
        #   https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/
        self.memory.add((state, action, reward, next_state, done), reward)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return np.clip(action, -1., 1.)

    def reset(self):
        self.noise.reset()

    def mse(self, expected, targets, is_weights):
        """Custom loss function that takes into account the importance-sampling weights."""
        td_error = expected - targets
        weighted_squared_error = is_weights * td_error * td_error
        return torch.sum(weighted_squared_error) / torch.numel(
            weighted_squared_error)

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        for i in range(1, LEARN_BATCH + 1):
            idxs, experiences, is_weights = self.memory.sample(BATCH_SIZE)

            states = torch.from_numpy(
                np.vstack([e[0] for e in experiences
                           if e is not None])).float().to(device)
            actions = torch.from_numpy(
                np.vstack([e[1] for e in experiences
                           if e is not None])).float().to(device)
            rewards = torch.from_numpy(
                np.vstack([e[2] for e in experiences
                           if e is not None])).float().to(device)
            next_states = torch.from_numpy(
                np.vstack([e[3] for e in experiences
                           if e is not None])).float().to(device)
            dones = torch.from_numpy(
                np.vstack([e[4] for e in experiences if e is not None
                           ]).astype(np.uint8)).float().to(device)

            is_weights = torch.from_numpy(is_weights).float().to(device)

            # ---------------------------- update critic ---------------------------- #
            # Target Policy Smoothing Regularization: add a small amount of clipped random noises to the selected action
            if POLICY_NOISE > 0.0:
                noise = torch.empty_like(actions).data.normal_(
                    0, POLICY_NOISE).to(device)
                noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP)
                # Get predicted next-state actions and Q values from target models
                actions_next = (self.actor_target(next_states) + noise).clamp(
                    -1., 1.)
            else:
                # Get predicted next-state actions and Q values from target models
                actions_next = self.actor_target(next_states)

            # Clipped Double-Q: take the minimum of the two target critics to
            # curb overestimation bias
            Q_targets_next = torch.min(\
                self.critic1_target(next_states, actions_next), \
                self.critic2_target(next_states, actions_next)).detach()

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

            # Compute critic1 loss
            Q_expected = self.critic1_local(states, actions)
            errors1 = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            critic1_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic1_optimizer.zero_grad()
            critic1_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1)
            self.critic1_optimizer.step()

            # Update priorities in the replay buffer
            self.memory.batch_update(idxs, errors1)

            # Compute critic2 loss
            Q_expected = self.critic2_local(states, actions)
            critic2_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic2_optimizer.zero_grad()
            critic2_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1)
            self.critic2_optimizer.step()

            # Delayed Policy Updates
            if i % UPDATE_ACTOR_EVERY == 0:
                # ---------------------------- update actor ---------------------------- #
                # Compute actor loss
                actions_pred = self.actor_local(states)
                actor_loss = -self.critic1_local(states, actions_pred).mean()
                # Minimize the loss
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # ----------------------- update target networks ----------------------- #
                self.soft_update(self.critic1_local, self.critic1_target, TAU)
                self.soft_update(self.critic2_local, self.critic2_target, TAU)
                self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.actor_local.state_dict(), actor_weights_file)
        torch.save(self.critic1_local.state_dict(), critic1_weights_file)
        torch.save(self.critic2_local.state_dict(), critic2_weights_file)

    def load_weights(self):
        self.actor_local.load_state_dict(torch.load(actor_weights_file))
        self.critic1_local.load_state_dict(torch.load(critic1_weights_file))
        self.critic2_local.load_state_dict(torch.load(critic2_weights_file))
Example #14
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, identity, state_size, action_size, random_seed, memory,
                 noise):
        # checked
        """Initialize an Agent object.
        
        Params
        ======
            identity (int): index used to name this agent's checkpoint file
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            memory: shared replay buffer
            noise: exploration noise process
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.checkpoint_file_path = './checkpoint_agent_' + str(
            identity) + '.pth'
        if os.path.isfile(self.checkpoint_file_path):
            self.actor_local.load_state_dict(
                torch.load(self.checkpoint_file_path))
            self.actor_target.load_state_dict(
                torch.load(self.checkpoint_file_path))

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise

    def act(self, state, add_noise=True):
        # checked
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # checked
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def reset(self):
        # checked

        self.noise.reset()
Example #15
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch (the volatile flag follows the legacy
        # pre-0.4 PyTorch Variable API; on a modern PyTorch this block would be
        # wrapped in torch.no_grad() instead)
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ])
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
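        # exploration: blend the deterministic policy action with random-process
        # noise; the noise share shrinks as epsilon is annealed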
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
Example #16
class DDPG():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, num_agents,
                 agent_id):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            num_agents (int): number of agents in the environment
            agent_id (int): index of this agent, used to slice the joint actions
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents
        self.agent_id = agent_id
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * EPOCHS)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.timestep += 1
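        # proportional prioritization: new transitions enter the buffer with priority
        # (|reward| + PRIORITY_EPS) ** PRIORITY_ALPHA, so zero-reward transitions keep
        # a small non-zero chance of being sampled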
        priority = (abs(reward) + PRIORITY_EPS)**PRIORITY_ALPHA
        self.memory.add(state, action, reward, next_state, done, priority)

        if self.timestep % UPDATE_EVERY != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(EPOCHS):
                experiences = self.memory.sample(device)
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        state = torch.from_numpy(state).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
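        # (this slicing assumes two agents with two action dimensions each: agent 0
        #  owns columns 0:2 of the joint action vector and agent 1 owns columns 2:4)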
        if self.agent_id == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
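        # (same two-agent layout as in the critic update: only this agent's slice of
        #  the joint action vector is replaced by its current policy output)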
        if self.agent_id == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self):
        torch.save(self.actor_local.state_dict(),
                   'actor{}.pth'.format(self.agent_id))
        torch.save(self.critic_local.state_dict(),
                   'critic{}.pth'.format(self.agent_id))

    def load(self):
        self.actor_local.load_state_dict(
            torch.load('actor{}.pth'.format(self.agent_id)))
        self.critic_local.load_state_dict(
            torch.load('critic{}.pth'.format(self.agent_id)))
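The OUNoise class used by the example above is not part of this excerpt. A minimal sketch of a typical Ornstein-Uhlenbeck noise process with the constructor shape used here (OUNoise(size, seed), where size may be an int or a (num_agents, action_size) tuple) is given below; the mu, theta and sigma defaults are assumptions, not values taken from the original code.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch; parameter defaults are assumed)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)          # size may be an int or a shape tuple
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state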
Example #17
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1
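                # mask = 0 marks a terminal transition so that returns are not
                # bootstrapped across episode boundaries in the actor-critic update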

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
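The get_action and get_reward helpers called in the sampling loop above are not shown in this excerpt. A hedged sketch of a typical GAIL-style pair is given below: the action is sampled from the Gaussian policy defined by the actor's (mu, std) outputs, and the imitation reward is -log D(s, a); the exact tensor handling in the original helpers may differ.

import math

import numpy as np
import torch


def get_action(mu, std):
    # draw a stochastic action from the Gaussian policy defined by the actor output
    action = torch.normal(mu, std)
    return action.data.numpy()


def get_reward(discrim, state, action):
    # GAIL-style imitation reward: -log D(s, a); it grows as the discriminator
    # becomes more convinced the (state, action) pair came from the expert data
    state_action = torch.Tensor(np.concatenate([state, np.atleast_1d(action)]))
    with torch.no_grad():
        return -math.log(discrim(state_action).item())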
Example #18
def test(n_episodes=100000,
         max_t=1000,
         print_every=10,
         eps=0,
         temperature=1,
         actor_state_dict=None):
    log_str = "Start Testing !!!"
    epf.write(log_str)
    epf.flush()
    print(log_str)

    Test_LR_ACTOR = 1e-3

    test_actor = Actor(dim_size=dim_size,
                       resource_size=2,
                       n_action_steps=2,
                       action_size=12,
                       h_size=128,
                       num_steps=opt.num_steps).to(device)
    test_actor.load_state_dict(actor_state_dict)
    test_actor_optimizer = optim.Adam(test_actor.parameters(),
                                      lr=Test_LR_ACTOR)
    test_scheduler = optim.lr_scheduler.ReduceLROnPlateau(test_actor_optimizer,
                                                          factor=0.9,
                                                          min_lr=1e-6)

    test_agent = Agent(test_actor, h_size=128, device=device)
    test_agent.set_fitness(opt.fitness)
    test_agent.reset()

    best_score = float("-Inf")
    best_reward_whole = float("-Inf")
    best_constraint = 0.
    best_episode = 0
    scores_window = deque(maxlen=print_every)
    scores = []
    episodes = 0
    has_succeed_history = False

    model = test_model
    m_file = os.path.join(m_file_path, model + ".csv")
    df = pd.read_csv(m_file)
    model_defs = df.to_numpy()

    env = MaestroEnvironment(model_defs=model_defs,
                             dim_size=dim_size,
                             resource_size=2,
                             n_action_steps=2,
                             dataflow=opt.df)
    state = env.reset()
    env.set_fitness(opt.fitness)
    env.set_constraint(opt.cstr)
    constraint_temp = [
        env.get_ref_constraint([action_bound[0], action_bound[1]]),
        env.get_ref_constraint([action_bottom[0], action_bottom[1]]),
        env.get_ref_constraint([action_bound[0], action_bottom[1]]),
        env.get_ref_constraint([action_bottom[0], action_bound[1]])
    ]
    max_constraint, min_constraint = max(constraint_temp), min(constraint_temp)
    # epf.write("Max constraint: {}\n".format(max_constraint))
    # epf.flush()
    # epf.write("Min constraint: {}\n".format(min_constraint))
    # epf.flush()
    set_constraint = min_constraint + (max_constraint - min_constraint) * ratio

    env.set_constraint_value(max_constraint, min_constraint, set_constraint)
    # epf.write("Set constraint: {}\n".format(set_constraint))
    # epf.flush()

    params_list = [get_params(test_actor, device=device)]
    temp_params = params_list[-1]
    # print(state, infos, eps,temperature)
    for i_adapt in range(n_episodes):
        score = 0
        # env.shuffle_model()
        state, infos = env.reset()
        for _ in range(max_t):
            action, log_prob = test_agent.act(state, infos, eps, temperature,
                                              temp_params)
            # print(log_prob)
            # pdb.set_trace()
            next_state, reward, done, infos, sig, impt = env.step(action)
            test_agent.step(state, action, log_prob, reward, next_state, done,
                            sig, impt, infos)
            state = next_state
            score += reward
            if done:
                break

        ## inner-gradient
        # learned_loss = compute_loss(test_agent, GAMMA, impt,infos)
        learned_loss = compute_learned_loss(test_agent, temp_params)

        inner_gradient = torch.autograd.grad(
            learned_loss,
            [v for _, v in params_list[i_adapt].items()],
            create_graph=not first_order,
            retain_graph=not first_order,
            allow_unused=True,
        )

        params_list.append(
            SGD_step(params_list[i_adapt],
                     inner_gradient,
                     INNER_LR,
                     testing=True))
        temp_params = params_list[-1]

        # for i_episode in range(0, n_episodes):
        #     if (i_episode+1) %100 ==0  and has_succeed_history:
        #         eps /= 1.2
        #         temperature /=1.01
        #         temperature = max(temperature,1)
        #         ajust_lr(ratio=0.8, actor_optimizer=test_actor_optimizer, min_lr=1e-6)

        #     # print(state, infos, eps,temperature)

        #     temp_params = get_params(test_actor, device=device)
        #     score = 0
        #     # env.shuffle_model()
        #     state, infos = env.reset()
        #     for t in range(max_t):
        #         action, log_prob = test_agent.act(state, infos, eps,temperature, temp_params)
        #         # print(log_prob)
        #         # pdb.set_trace()
        #         next_state, reward, done, infos, sig, impt= env.step(action)
        #         test_agent.step(state, action, log_prob, reward, next_state, done, sig, impt, infos)
        #         state = next_state
        #         score += reward
        #         if done:
        #             break

        #     if done and sig:
        #         loss = compute_loss(test_agent, GAMMA, impt, infos)
        #         test_actor_optimizer.zero_grad()
        #         # policy_loss.backward()
        #         meta_gradient = torch.autograd.grad(
        #             loss,
        #             [v for _, v in temp_params.items()],
        #             allow_unused=True,
        #             )
        #         transfer_gradient_to_shared(meta_gradient, test_actor, device)
        #         torch.nn.utils.clip_grad_norm_(test_actor.parameters(), CLIPPING_MODEL)
        #         torch.nn.utils.clip_grad_norm_(test_actor.lstm.parameters(), CLIPPING_LSTM)
        #         test_actor_optimizer.step()
        #         test_agent.reset()

        scores_window.append(score)
        scores.append(score)
        if np.mean(scores_window) > best_score:
            best_score = np.mean(scores_window)
        env_chkpt = env.get_chkpt()

        if infos["succeed"]:
            has_succeed_history = True

        if env_chkpt["best_sol"] is not None:
            best_sol = env_chkpt["best_sol"]["sol"]
            set_constraint = env_chkpt["ctrs_info"]["value"]
            max_constraint = env_chkpt["ctrs_info"]["max"]
            min_constraint = env_chkpt["ctrs_info"]["min"]
            episode_reward, episode_constraint = env.exterior_search(best_sol)
        else:
            episode_reward = float("-Inf")
            episode_constraint = 0.

        if episode_reward > best_reward_whole:
            best_reward_whole = episode_reward
            best_constraint = episode_constraint
            best_episode = i_adapt

    # for f in glob.glob("*.m"):
    #     os.remove(f)
    # for f in glob.glob("*.csv"):
    #     os.remove(f)
    print(best_episode)
    return best_reward_whole, best_constraint / env.constraint_value * 100
Example #19
    print('state size:', num_inputs)
    print('action size:', num_actions)

    writer = SummaryWriter(args.logdir)

    actor = Actor(num_inputs, num_actions)
    critic = Critic(num_inputs)

    running_state = ZFilter((num_inputs, ), clip=5)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=hp.critic_lr,
                              weight_decay=hp.l2_rate)

    episodes = 0
    for iter in range(15000):
Example #20
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=0):
        """
            Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """

        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size,
                                   self.full_action_size,
                                   device=device,
                                   random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size,
                                    self.full_action_size,
                                    device=device,
                                    random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)

        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.
        :param state: state as seen from single agent
        """

        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()

        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        #self.actor_target.eval()
        # convert to cpu() since noise is in cpu()
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states,
                                                all_next_actions)
            # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach())**2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states,
                                        all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
Example #21
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs={},
                 use_psn=False,
                 psn_kwargs={},
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()

        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (
                self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]

        if train_mode:
            actions += self.action_noise.sample()

        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()

        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # leave layer-norm parameters unperturbed
            param = params[name]
            random = torch.randn(param.shape)
            if use_cuda:
                random = random.cuda()
            param += random * self.param_noise.current_stddev

    def reset(self):
        self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        self.buffer.push(experience)
        self.i_step += 1
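        # learn from a random minibatch every `learn_every` steps and soft-update the
        # target networks every `update_every` steps, once a full batch is available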
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                self.update()  # soft-update the target networks towards the local networks

    def learn(self, priority=0.0):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_per:
            (states, actions, rewards, states_next,
             dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
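            # refresh the priorities of the sampled transitions with their new absolute
            # TD errors so that high-error samples are replayed more often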
            Q_error = Q_expected - Q_targets
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):

        filename = os.path.join(
            model_dir,
            f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(
                t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False,
                                           False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
Example #22
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        # next_q_values.volatile = False

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
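The module-level hard_update and soft_update helpers used by this example (and by the other DDPG variants above) are not included in the excerpt. They are conventionally defined along the following lines, with the same (target, source) argument order as the calls above; this is a sketch, not the original code.

def hard_update(target, source):
    """Copy every parameter of source into target (the tau = 1 special case)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak-average source into target: theta_target <- tau*theta_source + (1 - tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)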
Example #23
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 seed,
                 fc1=400,
                 fc2=300,
                 update_times=10):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.num_agents = num_agents
        self.update_times = update_times

        self.noise = []
        for i in range(num_agents):
            self.noise.append(
                rm.OrnsteinUhlenbeckProcess(size=(action_size, ),
                                            std=LinearSchedule(0.2)))

        # critic local and target network (Q-Learning)
        self.critic_local = Critic(state_size, action_size, fc1, fc2,
                                   seed).to(device)

        self.critic_target = Critic(state_size, action_size, fc1, fc2,
                                    seed).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # actor local and target network (Policy gradient)
        self.actor_local = Actor(state_size, action_size, fc1, fc2,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, fc1, fc2,
                                  seed).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())

        # optimizer for critic and actor network
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.a_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i],
                            done[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for i in range(self.update_times):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, training=True):
        """Returns continous actions values for all action for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
        """

        state = torch.from_numpy(state).float().detach().to(device)
        #print(state.shape,"act")

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()

        noise = []
        for i in range(self.num_agents):
            noise.append(self.noise[i].sample())

        return np.clip(actions.cpu().data.numpy() + np.array(noise), -1, 1)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        next_actions = self.actor_target(next_states)
        with torch.no_grad():
            Q_target_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (gamma * Q_target_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)

        #critic loss
        loss = F.mse_loss(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.optimizer_critic.step()

        #actor loss

        action_pr = self.actor_local(states)
        p_loss = -self.critic_local(states, action_pr).mean()

        self.optimizer_actor.zero_grad()
        p_loss.backward()

        self.optimizer_actor.step()

        # ------------------- update target network ------------------- #

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset_random(self):
        for i in range(self.num_agents):
            self.noise[i].reset_states()
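The ReplayBuffer class used here is not shown in the excerpt. The interface implied by the calls in this example (add(state, action, reward, next_state, done), sample() returning tensor batches, and __len__) is commonly implemented along the following lines; some of the other examples use variants that also take a priority or a device argument, and the tensors are left on the CPU here. This is a sketch under those assumptions, not the original class.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (interface sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random minibatch and stack it into float tensors (kept on the CPU)."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)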
Example #24
class Agent():
    def __init__(self, test=False):
        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        #########################################
        """
        Some hand-tuned configuration (for development)
        """
        self.discrete = False
        self.action_dim = 1
        self.state_dim = 3
        self.batch_size = 100
        self.action_low = -2
        self.action_high = 2
        ##########################################
        self.P_online = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        # discounted reward
        self.gamma = 0.99
        self.eps = 0.25
        # optimizer
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=1e-3)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=1e-3)
        # saved rewards and actions
        self.replay_buffer = ReplayBuffer()

        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.

        self.ep_step = 0

    def act(self, state, test=False):
        if not test:
            with torch.no_grad():
                # boring type casting
                state = ((torch.from_numpy(state)).unsqueeze(0)).float().to(
                    self.device)
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                # if self.ep_step < 200:
                # self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                # a = a + self.ou_level
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(
                            self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low,
                                     self.action_high)
                    return action, action

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float(),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("Circular queue doesn't need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device=self.device)
        # discounted rewards
        # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device)

        ### debug shape : ok
        #===============================Critic Update===============================
        self.Q_online.train()
        Q = self.Q_online((states, actions))

        with torch.no_grad():  # don't need backprop for target value
            self.Q_target.eval()
            self.P_target.eval()
            target = rewards + self.gamma * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        critic_loss_fn = torch.nn.MSELoss()
        critic_loss = critic_loss_fn(Q, target).mean()
        # update
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        self.q_optimizer.step()
        # print("critic loss", critic_loss.item())

        #===============================Actor Update===============================
        # fix online_critic , update online_actor
        self.Q_online.eval()
        for p in self.Q_online.parameters():
            p.requires_grad = False
        for p in self.P_online.parameters():
            p.requires_grad = True
        policy_loss = -self.Q_online((states, self.P_online(states)))
        policy_loss = policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        for p in self.Q_online.parameters():
            p.requires_grad = True
        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)
        self.eps -= EPSILON_DECAY
        if self.eps <= 0:
            self.eps = 0
Example #25
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
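        # rearrange an H x W x C image to the channels-first C x H x W layout expected by the CNN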
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if(self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if(self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon-greedy exploration

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
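Below is a minimal interaction-loop sketch, not part of the original source, showing how an already-constructed agent of the class above could be driven against a Gym-style continuous-control environment; env, the warm-up length, and the step budget are illustrative assumptions.

obs = env.reset()
agent.reset(obs)
warmup = 1000                                      # assumed warm-up before learning starts
for step in range(50000):
    if step < warmup:
        action = agent.random_action()             # pure exploration to fill the replay buffer
    else:
        action = agent.select_action(obs, noise_level=0.1)
    obs, reward, done, _ = env.step(action)
    agent.observe(reward, obs, done)               # stores (s_t, a_t, r_t, s_t1, done)
    if step >= warmup:
        policy_loss, value_loss = agent.update_policy()
    if done:
        obs = env.reset()
        agent.reset(obs)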
Example #26
class ddpg_Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration given a variety of parameters
        """
        self.env = env
        self.config = config

        # set parameter for ML
        self.set_parameters(config)
        # Q-Network
        self.create_networks()
        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)
        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
    
    def set_parameters(self, config):
        # Base agent parameters
        self.gamma = config['gamma']                    # discount factor 
        self.tau = config['tau']
        self.max_episodes = config['max_episodes']      # max number of episodes to train
        self.env_file_name = config['env_file_name']    # name and path for env app
        self.brain_name = config['brain_name']          # name for env brain used in step
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.dropout = config['dropout']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.seed = config['seed']
        self.noise_scale = 1
        self.noise_sigma = 0.1
        # Some debug flags
        self.DoDebugEpisodeLists = False        
        
    def create_networks(self):
        # Actor Network (local & Target Network)
        self.actor_local = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_learning_rate)

        # Critic Network (local & Target Network)
        self.critic_local = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_target = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_learning_rate)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # print('step : Next States : ',next_state.shape)
        self.memory.add(state, action, reward, next_state, done)
        # print('New step added to memory, length : ',len(self.memory))

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)
            
    def update_noise_scale(self, cur_reward, scale_min=0.2, scale_noise=False):
        """If scale_noise is True, self.noise_scale is decreased as rewards improve.
           Currently hand coded: as rewards go up, noise_scale goes down from 1 to scale_min."""

        if scale_noise:
            rewlow = 2    # below rewlow noise_scale stays at 1; from there it decreases linearly
                          # to scale_min + 0.5*(1 - scale_min) until rewhigh is reached
            rewhigh = 10  # above rewhigh noise_scale falls linearly to scale_min at reward 30
                          # and stays at scale_min beyond that
            if cur_reward > rewlow:
                if cur_reward < rewhigh:
                    self.noise_scale = (1 - scale_min)*(0.5*(rewhigh-cur_reward)/(rewhigh - rewlow) + 0.5) + scale_min
                else:
                    # clamp the linear term at 0 so the scale never drops below scale_min
                    self.noise_scale = (1 - scale_min)*max(0.5*(30 - cur_reward)/(30 - rewhigh), 0) + scale_min

            print('Updated noise scale to : ', self.noise_scale)
                
        return                    
        

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = ten(state)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_scale * self.noise.sample()
            # ToDo check if tanh works better
        return np.clip(action, -1, 1)

    def train(self):
        if False:  # set to True to resume training from a previously saved checkpoint
            filename = 'trained_reacher_a_e100.pth'
            self.load_agent(filename)
        all_rewards = []
        reward_window = deque(maxlen=100)
        print('Running on device : ',device)
        for i_episode in range(self.max_episodes): 
            tic = time.time()
            # Reset the enviroment
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            total_reward = np.zeros(self.num_agents)
            t = 0
            done = np.zeros(self.num_agents, dtype = bool)

            # loop over episode time steps
            while not np.any(done):  # alternatively: t < self.tmax
                # act and collect data
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = np.asarray(env_info.rewards)
                done = np.asarray(env_info.local_done)
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,action,reward,done))
                # increment stuff
                t += 1
                total_reward += reward
                # Proceed agent step
                self.step(state, action, reward, next_state, done)
                # prepare for next round
                state = next_state
            # while not done
            # keep track of rewards:
            all_rewards.append(np.mean(total_reward))
            reward_window.append(np.mean(total_reward))
            
            # Output Episode info : 
            toc = time.time()
            if (i_episode == 100):
                self.stable_update()
            self.update_noise_scale(np.mean(reward_window))
            if i_episode % 25 != 0:
                print('Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f} || Used {:5.3f} seconds, mem : {}'.format(i_episode,np.mean(total_reward),np.mean(reward_window),toc-tic,len(self.memory)))
            else:
                print(Back.RED + 'Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f}'.format(i_episode,np.mean(total_reward),np.mean(reward_window)))
                print(Style.RESET_ALL)
                
            if (i_episode % 50 == 0):
                self.save_agent(i_episode)
        # for i_episode
            
        return all_rewards

    def reset(self):
        self.noise.reset()
        
    def stable_update(self):
        """ Update Hyperparameters which proved more stable """
        self.buffer_size = 400000
        self.memory.enlarge(self.buffer_size)
        self.noise_sigma = 0.05
        self.noise.sigma = 0.05

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            self.gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        # print('learn : Next States : ',next_states.shape)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # print('learn : Actions : ',actions_next.shape)
        # print('learn : Q_target_next : ',Q_targets_next.shape)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_agent(self,i_episode):
        filename = 'trained_reacher_e'+str(i_episode)+'.pth'
        torch.save({
            'critic_local': self.critic_local.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_local': self.actor_local.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            }, filename)
        
        print('Saved Networks in ',filename)
        return
        
    def load_agent(self,filename):
        savedata = torch.load(filename)
        self.critic_local.load_state_dict(savedata['critic_local'])
        self.critic_target.load_state_dict(savedata['critic_target'])
        self.actor_local.load_state_dict(savedata['actor_local'])
        self.actor_target.load_state_dict(savedata['actor_target'])
        return
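A sketch of the configuration dictionary ddpg_Agent expects, with keys taken from set_parameters above; the concrete values and the Unity Reacher file/brain names are illustrative assumptions only.

config = {
    'gamma': 0.99,
    'tau': 1e-3,
    'max_episodes': 300,
    'env_file_name': 'Reacher.app',        # path to the Unity environment binary (assumed)
    'brain_name': 'ReacherBrain',          # brain name used when stepping the env (assumed)
    'num_agents': 20,
    'state_size': 33,
    'action_size': 4,
    'hidden_size': 256,
    'buffer_size': 100000,
    'batch_size': 128,
    'dropout': 0.2,
    'critic_learning_rate': 1e-3,
    'actor_learning_rate': 1e-4,
    'seed': 0,
}
agent = ddpg_Agent(env, config)
all_rewards = agent.train()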
Example #27
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 memory=None,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 pretrained_actor_weights=None,
                 pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)

        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        if memory:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
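A single-agent training-loop sketch for the Agent class above, assuming a Gym-style environment with actions in [-1, 1]; the environment handle and the episode budget are illustrative assumptions.

agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=0)
for episode in range(200):
    state = env.reset()
    agent.reset()                                  # reset the OU noise process
    done = False
    while not done:
        action = agent.act(state)                  # shape (1, action_size) after unsqueeze
        next_state, reward, done, _ = env.step(action.squeeze(0))
        agent.step(state, action, reward, next_state, done)
        state = next_state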
Example #28
class Agent():
    "Single agent no learning algorithm"

    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0):
        """Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            lr_actor (float) : learning rate actor network
            lr_critic (float) : learning rate critic network
            weight_decay (float) : weight decay regularizer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.noise = OUNoise(action_size, random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        if not isinstance(state, torch.Tensor):
            state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        weights = torch.load(filename, map_location=map_location)
        self.actor_local.load_state_dict(weights['actor'])
        if 'critic' in weights:
            self.critic_local.load_state_dict(weights['critic'])

    def reset(self):
        self.noise.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        checkpoint = {
            'actor': self.actor_local.state_dict(),
            'critic': self.critic_local.state_dict()
        }
        torch.save(checkpoint, filename)
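A short save/restore sketch for the inference-only agent above; the filename and map_location are illustrative assumptions, and the checkpoint is the dict with 'actor' and 'critic' keys produced by save().

agent.save('checkpoint.pth')
agent.load('checkpoint.pth', map_location='cpu')    # e.g. reload GPU-trained weights on CPU
action = agent.act(state, add_noise=False)           # deterministic action at evaluation time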
Example #29
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
        
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_w':args.init_w
        }
        self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = use_cuda
        # 
        if self.use_cuda: self.cuda()
        
    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # state_batch, action_batch, reward_batch, \
        # next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float32))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ])
        
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        print("use cuda")
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=1):
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
        # print(self.random_process.sample(), action)
        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        # print(max(self.epsilon, 0) * self.random_process.sample() * noise_level, noise_level)
        action = np.clip(action, -1., 1.)
        # print(action)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )


    def save_model(self, output):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self,s):
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
class Multi_Agents():
    """
    Implements interactions and learning on environments for a set of agents
    """
    def __init__(self, agents_count, state_size, action_size, random_seed,
                 buffer_size, batch_size, gamma, fc1_units, fc2_units, noise,
                 lr_actor, lr_critic):
        """Initialize a Multi_Agent.

        Params
        ======
            agents_count (int): the number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size(int): replay buffer size
            gamma(float): discount factor
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            noise(Object): The noise applied to the actions selection
            lr_actor(float) : learning rates of the actor
            lr_critic(float) : learning rates of the critic
        """

        self.agents_count = agents_count
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.gamma = gamma
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 fc1_units, fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   fc1_units, fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    fc1_units, fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=WEIGHT_DECAY)

        # After reading the implementation of ShangtongZhang, as suggested in the course,
        # it seems relevant to initialize the weights of the target networks
        # with the same values as the local networks:
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Noise process
        self.noise = noise

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        for a in range(self.agents_count):
            # save for each agent
            self.memory.add(states[a], actions[a], rewards[a], next_states[a],
                            dones[a])

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, states, add_noise=True):
        """Returns actions for each given state of each agent as per current policy."""

        states = torch.from_numpy(states).float().to(device)
        actions = np.empty([self.agents_count, self.action_size])

        self.actor_local.eval()
        with torch.no_grad():
            for a in range(self.agents_count):
                actions[a] = self.actor_local(states[a]).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # as suggested in the "Benchmark implementation" section of the course
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
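A construction-and-stepping sketch for Multi_Agents above, assuming a Unity ML-Agents style environment that returns one observation row per agent; the sizes, hyperparameter values, and the OUNoise signature (borrowed from the single-agent examples) are illustrative assumptions.

noise = OUNoise(4, 0)                                 # (action_size, seed), signature assumed
agents = Multi_Agents(agents_count=20, state_size=33, action_size=4, random_seed=0,
                      buffer_size=int(1e6), batch_size=128, gamma=0.99,
                      fc1_units=256, fc2_units=128, noise=noise,
                      lr_actor=1e-4, lr_critic=1e-3)
states = env_info.vector_observations                 # shape (agents_count, state_size)
actions = agents.act(states)                          # shape (agents_count, action_size)
# after env.step(actions): agents.step(states, actions, rewards, next_states, dones)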
Example #31
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, seed, memory, batch_size, lr_actor, lr_critic, clip_critic, gamma, tau, weight_decay, update_network_steps, sgd_epoch, checkpoint_prefix):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): The replay buffer for storing experiences
            batch_size (int): Number of experiences to sample from the memory
            lr_actor (float): The learning rate for the actor
            lr_critic (float): The learning rate for the critic
            clip_critic (float): The gradient clipping value for the critic update
            gamma (float): The reward discount factor
            tau (float): For soft update of target parameters
            weight_decay (float): The weight decay
            update_network_steps (int): How often to update the network
            sgd_epoch (int): Number of iterations for each network update
            checkpoint_prefix (string): The string prefix for saving checkpoint files
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.memory = memory
        self.batch_size = batch_size
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.clip_critic = clip_critic
        self.gamma = gamma
        self.tau = tau
        self.weight_decay = weight_decay
        self.update_network_steps = update_network_steps
        self.sgd_epoch = sgd_epoch
        self.n_step = 0
        
        # checkpoint
        self.checkpoint_prefix = checkpoint_prefix
        self.actor_loss_episodes = []
        self.critic_loss_episodes = []
        self.actor_loss = 0
        self.critic_loss = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def step(self, state, action, action_prob, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(len(state)):
            self.memory.add(state[i], action[i], action_prob[i], reward[i], next_state[i], done[i])
         
        # learn every n steps
        self.n_step = (self.n_step + 1) % self.update_network_steps
        if self.n_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > self.batch_size:
                for i in range(self.sgd_epoch):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1), np.zeros_like(action) # N/A action prob for DDPG

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, action_probs, rewards, next_states, dones = experiences

        # normalize rewards
        rewards = utils.normalize_rewards(rewards)
        
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_loss = critic_loss
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic > 0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_loss = actor_loss
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def checkpoint(self):
        """Save internal information in memory for later checkpointing"""
        self.actor_loss_episodes.append(self.actor_loss)
        self.critic_loss_episodes.append(self.critic_loss)

    def save_checkpoint(self):
        """Persist checkpoint information"""
        # the history loss
        utils.plot_scores(self.checkpoint_prefix + "_actor_loss.png", self.actor_loss_episodes, label="loss")
        utils.plot_scores(self.checkpoint_prefix + "_critic_loss.png", self.critic_loss_episodes, label="loss")
        
        # network
        torch.save(self.actor_local.state_dict(), self.checkpoint_prefix + "_actor.pth")
        torch.save(self.critic_local.state_dict(), self.checkpoint_prefix + "_critic.pth")

    def load_checkpoint(self):
        """Restore checkpoint information"""
        self.actor_local.load_state_dict(torch.load(self.checkpoint_prefix + "_actor.pth"))
        self.critic_local.load_state_dict(torch.load(self.checkpoint_prefix + "_critic.pth"))
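A brief checkpointing sketch for the agent above; assuming checkpoint_prefix='run1', save_checkpoint() writes run1_actor.pth / run1_critic.pth plus the two loss plots, and load_checkpoint() restores the local networks from the same prefix.

agent.checkpoint()        # record the latest actor/critic losses (call once per episode)
agent.save_checkpoint()   # persists weights and loss curves under the configured prefix
agent.load_checkpoint()   # restores actor_local / critic_local from the saved .pth files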
Example #32
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 agents=2,
                 every=4,
                 updates=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        np.random.seed(random_seed)
        self.agents = agents
        self.every = every
        self.updates = updates
        self.steps = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noises = OUNoise((agents, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device)

    def load_actor(self, model_file: str):
        self.actor_local.load_state_dict(
            torch.load(model_file, map_location=device))
        self.actor_local.to(device)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.steps += 1
        for i in range(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.steps % self.every == 0:
            self.steps = 0
            for _ in range(self.updates):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noises.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noises.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # Gradient clipping
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
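A construction sketch for the agent above; the sizes and hyperparameter values are illustrative assumptions. With every=4 and updates=4 it performs four sampled gradient updates once every four environment steps, provided the buffer already holds at least BATCH_SIZE transitions.

agent = Agent(state_size=24, action_size=2, random_seed=0, agents=2, every=4, updates=4)
actions = agent.act(states)                            # states has shape (agents, state_size)
# per environment step: agent.step(states, actions, rewards, next_states, dones)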
Example #33
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target(
                [next_state_batch,
                 self.actor_target(next_state_batch)])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float32))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch),
                 to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic(
                [to_tensor(state_batch),
                 self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(
                    np.mean([
                        np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                        for p in self.actor.parameters()
                    ]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if (self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if (self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
        if self.pic:
            action = np.concatenate(
                (softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t
                                                             ])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = (action +
                      self.random_action(fix=True)) / 2.  # epsilon-greedy mixing

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action

        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
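A sketch of the argument namespace consumed by the DDPG class above; the attribute names are the ones read in __init__, while the values and the use of SimpleNamespace are illustrative assumptions (the original presumably uses argparse).

from types import SimpleNamespace

args = SimpleNamespace(
    hidden1=400, hidden2=300, bn=False, init_method='uniform',
    pic=False, pic_status=64, crate=1e-3,                 # CNN options, only used when pic=True
    prate=1e-4, rate=1e-3,                                # actor / critic learning rates
    rmsize=1000000, batch_size=64, tau=1e-3, discount=0.99,
    epsilon=50000, window_length=1, discrete=False,
    clip_actor_grad=None, cuda=False)
agent = DDPG(nb_status=env.observation_space.shape[0],
             nb_actions=env.action_space.shape[0], args=args, writer=None)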
Example #34
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
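A sketch of the argument set main() above relies on; the attribute names are those referenced inside the function, while the defaults and the use of argparse are illustrative assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_name', type=str, default='Hopper-v2')     # assumed default
parser.add_argument('--seed', type=int, default=500)
parser.add_argument('--load_model', type=str, default=None)
parser.add_argument('--render', action='store_true')
parser.add_argument('--learning_rate', type=float, default=3e-4)
parser.add_argument('--l2_rate', type=float, default=1e-3)
parser.add_argument('--total_sample_size', type=int, default=2048)
parser.add_argument('--max_iter_num', type=int, default=500)
args = parser.parse_args()

if __name__ == "__main__":
    main()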