Example 1
def ReplayBuffer_Init(rep_buf_size, rep_buf_ini, env, action_space):
    """Initialize the ReplayBuffer with the size of rep_buf_ini
    :param rep_buf_ini: size of initialized replay buffer
    :param action_space: action space dimension of game
    :return:
    """
    replay_buffer = ReplayBuffer(rep_buf_size)
    while len(replay_buffer) < rep_buf_ini:

        observation = env.reset()
        done = False

        while not done:
            t_observation = torch.from_numpy(observation).float()
            # t_observation.shape: torch.Size([4, 84, 84])
            # reshaped below to torch.Size([1, 4, 84, 84])
            t_observation = t_observation.view(1, t_observation.shape[0],
                                               t_observation.shape[1],
                                               t_observation.shape[2])
            action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

    print('Experience Replay buffer initialized')
    return replay_buffer
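The ReplayBuffer class used throughout these examples is not shown. A minimal sketch that is consistent with the push/sample/len interface used above, assuming a deque-backed FIFO store with uniform random sampling (an assumption, not the original implementation), could look like this:

import random
from collections import deque


class ReplayBuffer:
    """Minimal FIFO experience replay buffer (assumed interface)."""

    def __init__(self, capacity):
        # the oldest transitions are discarded automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, observation, action, reward, next_observation, done):
        self.buffer.append((observation, action, reward, next_observation, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        observations, actions, rewards, next_observations, dones = zip(*batch)
        return observations, actions, rewards, next_observations, dones

    def __len__(self):
        return len(self.buffer)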
Example 2
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 model='DQN',
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 pretrained_model_file=None):

        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (str): currently supports 'DQN' and 'DDQN'
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            pretrained_model_file (str): filepath to .pth file with pretrained model weights
        """
        if model not in ('DQN', 'DDQN'):
            raise ValueError("model must be 'DQN' or 'DDQN'")

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.model = model

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        if pretrained_model_file:
            weights = torch.load(pretrained_model_file)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example 3
 def __init__(self, state_size, action_size, num_agents, random_seed):
     self.agents = [
         DDPG(state_size, action_size, num_agents, random_seed),
         DDPG(state_size, action_size, num_agents, random_seed)
     ]
     self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
     self.num_agents = num_agents
     self.state_size = state_size
     self.action_size = action_size
     self.eps = EPS_START
     self.eps_decay = 1 / EPS_EP_END  # set decay rate based on epsilon end target
Example 4
    def __init__(self, state_size, action_size, random_seed,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE,
                 gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY,
                 pretrained_actor_weights=None, pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay)

        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)

        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed)
Example 5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Score tracker and learning parameters
        self.score = 0
        self.best_score = -np.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.3
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 200
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
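OUNoise is constructed here with (size, mu, theta, sigma) but is never defined in these snippets (other examples use a variant that takes a random seed instead). A common Ornstein-Uhlenbeck process matching the constructor used in this example is sketched below; it is an assumption about the missing class, not the original code:

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (sketch matching the (size, mu, theta, sigma) constructor)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # start each episode from the long-running mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # mean-reverting drift plus Gaussian perturbation
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state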
Example 6
 def __init__(self, state_size, action_size, num_agents, random_seed):
     """init the agent"""
     self.state_size = state_size
     self.action_size = action_size
     self.seed = random_seed
     
     # Construct Actor networks
     self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
     self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
     self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
     
     # Construct Critic networks 
     self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
     self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
     self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
     
     # noise processing
     self.noise = OUNoise((num_agents, action_size), random_seed)
     
     # Replay memory
     self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
Example 7
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        
        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.score = -np.inf
Example 8
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env,
                                      clip_rewards=False,
                                      frame_stack=True,
                                      pytorch_img=True)

    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN Model and optimizer:
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())

    optimizer = torch.optim.RMSprop(policy_model.parameters(),
                                    lr=lr,
                                    alpha=alpha)

    # Initialize the Replay Buffer
    replay_buffer = ReplayBuffer(rep_buf_size)

    while len(replay_buffer) < rep_buf_ini:

        observation = env.reset()
        done = False

        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(
                    device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

    print('Experience Replay buffer initialized')

    # Use log to record the performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training part
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:

        observation = env.reset()
        done = False
        # import time
        # start=time.time()

        while not done:

            with torch.no_grad():

                t_observation = torch.from_numpy(observation).float().to(
                    device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(
                        int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])
            num_frames += 1
            score += reward

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

            # Update policy
            if len(replay_buffer) > batch_size and num_frames % skip_frame == 0:
                observations, actions, rewards, next_observations, dones = replay_buffer.sample(
                    batch_size)

                observations = torch.from_numpy(np.array(observations) /
                                                255).float().to(device)

                actions = torch.from_numpy(
                    np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)

                rewards = torch.from_numpy(
                    np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)

                next_observations = torch.from_numpy(
                    np.array(next_observations) / 255).float().to(device)

                dones = torch.from_numpy(
                    np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)

                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)

                optimizer.zero_grad()
                loss.backward()

                optimizer.step()

                for target_param, policy_param in zip(
                        target_model.parameters(), policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data +
                                            (1 - TAU) * target_param.data)

        episode += 1
        # episode_score.append(score)
        # end=time.time()
        # print("Running time ( %i episode): %.3f Seconds "%(episode ,end-start))

        if info['ale.lives'] == 0:
            # episode_score.append(score)
            mean_score = score
            episode_true += 1
            score = 0

            # if episode % 20 == 0:
            # mean_score = np.mean(episode_score)
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            # episode_score = []
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                        str(episode_true) + ' / Average Score : ' +
                        str(int(mean_score)) + '   / epsilon: ' +
                        str(float(epsilon)))
            #plot_score(mean_episode_score, episode_true)
            pickle.dump(mean_episode_score,
                        open('./dqn_Riverraid_mean_scores.pickle', 'wb'))
            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                            str(episode_true) + ' / Average Score : ' +
                            str(int(mean_score)) + '   / epsilon: ' +
                            str(float(epsilon)) +
                            '   / last_100episode_score: ' +
                            str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(),
                       './dqn_riverraid_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(),
                       './dqn_riverraid_model_state_dict.pt')
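This example depends on helpers and hyperparameters defined elsewhere in the module (DQNModel, epsilon_by_frame, huber_loss, lr, alpha, rep_buf_size, rep_buf_ini, batch_size, skip_frame, gamma, TAU, max_episodes). As a hedged illustration only, the two helper functions could be defined along the following lines, assuming the usual exponentially decaying epsilon-greedy schedule and PyTorch's built-in smooth L1 loss; the schedule constants are assumptions, not values from the original source:

import math

import torch.nn.functional as F

# assumed schedule constants (not taken from the original source)
epsilon_start = 1.0
epsilon_final = 0.02
epsilon_decay = 100000


def epsilon_by_frame(frame_idx):
    """Exponentially anneal epsilon from epsilon_start towards epsilon_final."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)


def huber_loss(q_value, expected_q_value):
    """Huber (smooth L1) loss between predicted and target Q-values."""
    return F.smooth_l1_loss(q_value, expected_q_value)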
Example 9
    def _setup(self, dconfig, logdir):
        """
        Create tensorflow graph and summary writer
        :param dconfig: configuration to use to build the graph
        :param logdir: log directory to write tensorflow logs to
        """
        env = gym.make(dconfig.env_name)
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        act_limit = env.action_space.high[0]

        agent = Agent(dconfig, env)
        objective = Objective(dconfig)

        # Experience buffer
        replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                     act_dim=act_dim,
                                     size=dconfig.buffer_size,
                                     discount_factor=dconfig.discount_factor)

        time = dconfig.recurrent_time_steps if dconfig.recurrent_time_steps > 1 else None

        # Create datasets from replay buffer
        replay_buffer_dataset = replay_buffer.create_dataset(
            dconfig.buffer_sample_size, time)
        replay_buffer_dataset_iterator = replay_buffer_dataset.make_initializable_iterator()

        # If we perform multiple gradient steps in the inner loop, provide different data for each step
        large_batch_size = (self.dconfig.obj_func_second_order_steps +
                            1) * dconfig.buffer_sample_size
        large_replay_buffer_dataset = replay_buffer.create_dataset(
            large_batch_size, time)
        large_replay_buffer_dataset_iterator = large_replay_buffer_dataset.make_initializable_iterator()

        handle = tf.placeholder(tf.string, shape=[])
        iterator = tf.data.Iterator.from_string_handle(
            handle, replay_buffer_dataset.output_types,
            replay_buffer_dataset.output_shapes)
        itr_elem = utils.DotDict(iterator.get_next())
        x_ph, a_ph, x2_ph, r_ph, d_ph, lens_ph = itr_elem.obs1, itr_elem.acts, itr_elem.obs2,\
                                                 itr_elem.rews, itr_elem.done, itr_elem.lens

        # Mask for different trajectory lengths
        if lens_ph is not None:
            seq_mask = tf.sequence_mask(lens_ph, time, dtype=tf.float32)
        else:
            seq_mask = tf.ones([], dtype=tf.float32)

        x_ph_behv = placeholder(obs_dim, name='ObsBehavior')
        timestep = tf.placeholder(tf.float32, [], 'timestep')

        if dconfig.policy_is_recurrent:
            state_shape = [2, 1, dconfig.policy_units]
            init_policy_state = tf.placeholder_with_default(
                tf.zeros(state_shape), [2, 1, dconfig.policy_units])
        else:
            init_policy_state = None

        transition = [
            x_ph, a_ph, x2_ph, r_ph[..., tf.newaxis], d_ph[..., tf.newaxis]
        ]

        # Learning rate annealing
        if dconfig.policy_update_start:
            base = dconfig.policy_lr_annealing_base
            lr_progress = (base**tf.minimum(
                1.0, timestep / dconfig.policy_update_start) - 1) / (base - 1)
        else:
            lr_progress = 1

        # Optimizers
        pi_optimizer = utils.TensorAdamOptimizer(
            learning_rate=dconfig.policy_learning_rate * lr_progress)
        q_optimizer = tf.train.AdamOptimizer(
            learning_rate=dconfig.critic_learning_rate)
        obj_optimizer = tf.train.AdamOptimizer(
            learning_rate=dconfig.obj_func_learning_rate)

        # Main outputs from computation graph
        main = agent.main
        policy = main.policy(x_ph, seq_len=lens_ph)
        pi_action = policy.action
        q1_pi = policy.value
        pi_behv = main.policy(x_ph_behv[:, tf.newaxis],
                              initial_state=init_policy_state)
        q1 = main.critic(x_ph, a_ph)
        q2 = main.critic2(x_ph, a_ph)
        obj = objective.objective(x_ph, a_ph, transition, lens_ph, seq_mask,
                                  agent, policy)

        # Target policy network
        pi_action_targ = agent.target.policy(x2_ph, seq_len=lens_ph).action

        # Target Q networks
        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_action_targ),
                                   stddev=dconfig.critic_noise)
        epsilon = tf.clip_by_value(epsilon, -dconfig.critic_noise_clip,
                                   dconfig.critic_noise_clip)
        a2 = pi_action_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)
        q1_targ = agent.target.critic(x2_ph, a2)
        q2_targ = agent.target.critic2(x2_ph, a2)

        # Bellman backup for Q functions, using Clipped Double-Q targets
        min_q_targ = tf.minimum(q1_targ, q2_targ)
        gamma = dconfig.discount_factor
        backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ +
                                  d_ph)

        # Objective function annealing
        if dconfig.obj_func_anneal_steps:
            progress = tf.minimum(1.0,
                                  timestep / dconfig.obj_func_anneal_steps)
            obj = progress * obj - (1 - progress) * q1_pi

        # TD3 losses
        pi_loss = -tf.reduce_mean(q1_pi * seq_mask)
        pi_obj_loss = tf.reduce_mean(obj * seq_mask)
        q1_loss = tf.reduce_mean((q1 - backup)**2 * seq_mask)
        q2_loss = tf.reduce_mean((q2 - backup)**2 * seq_mask)
        q_loss = q1_loss + q2_loss

        main_vars = sorted(get_vars('main', trainable_only=False),
                           key=lambda v: v.name)
        target_vars = sorted(get_vars('target', trainable_only=False),
                             key=lambda v: v.name)

        # Train policy directly using critic
        train_pi_op = self._clipped_minimize(pi_optimizer,
                                             pi_loss,
                                             get_vars('main/policy'),
                                             grad_name='ddpg_policy_grads')
        # Train policy using objective function
        train_pi_obj_op = self._clipped_minimize(
            pi_optimizer,
            pi_obj_loss,
            get_vars('main/policy'),
            grad_name='objective_policy_grads')
        # Train critic
        train_q_op = q_optimizer.minimize(q_loss,
                                          var_list=get_vars('main/critic'))
        tf.summary.histogram('policy_params',
                             utils.flat(get_vars('main/policy')))

        # Objective function loss
        q1_obj = objective.future_policy_value(
            x_ph,
            a_ph,
            transition,
            lens_ph,
            seq_mask,
            agent,
            pi_optimizer,
            create_summary=dconfig.obj_func_enabled)
        obj_loss = -tf.reduce_mean(q1_obj)

        # Objective function optimization using ray (send gradients to ObjectiveServer)
        obj_vars = get_vars('objective')
        store_socket = utils.get_store_socket()

        shapes = [v.shape for v in obj_vars]
        plasma_var_oid = tf.placeholder(shape=[],
                                        dtype=tf.string,
                                        name="plasma_var_oid")
        retrieved_vars = utils.reverse_flat(
            plasma.tf_plasma_op.plasma_to_tensor(
                plasma_var_oid,
                dtype=tf.float32,
                plasma_store_socket_name=store_socket), shapes)
        # Op to read new objective parameters from ray object store
        plasma_read_vars = [
            var.assign(retrieved)
            for var, retrieved in zip(obj_vars, retrieved_vars)
        ]

        grads, vars = zip(*obj_optimizer.compute_gradients(obj_loss, obj_vars))
        grads, _ = tf.clip_by_global_norm(grads,
                                          clip_norm=dconfig.clip_gradient)
        tf.summary.histogram('objective_params', utils.flat(vars))
        tf.summary.histogram('objective_param_grads', utils.flat(grads))
        objective_grads = grads
        # Op to send gradients to ObjectiveServer
        train_obj_op = obj_optimizer.apply_gradients(zip(
            objective_grads, vars))

        plasma_grad_oid = tf.placeholder(shape=[],
                                         dtype=tf.string,
                                         name="plasma_grad_oid")
        # Op to send gradients to ObjectiveServer
        plasma_write_grads = plasma.tf_plasma_op.tensor_to_plasma(
            [utils.flat(objective_grads)],
            plasma_grad_oid,
            plasma_store_socket_name=store_socket)

        # Print number of parameters
        print(f'''
        ===================================================================
        Parameters
        Policy {np.sum(np.prod(v.shape) for v in get_vars('main/policy'))}
        Critic {np.sum(np.prod(v.shape) for v in get_vars('main/critic'))}
        Objective {np.sum(np.prod(v.shape) for v in obj_vars)}
        ===================================================================
        ''')

        # Polyak averaging for target variables
        polyak = 1 - dconfig.target_network_update_speed
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(main_vars, target_vars)
        ])

        # Initializing target networks to match main variables
        target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(main_vars, target_vars)
        ])

        # Ops for copying and resetting the policy (currently not used)
        reset_policy = tf.variables_initializer(get_vars('main'))
        copy_policy = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        # Summaries
        tflog_utils.log_scalars(policy_loss=pi_loss, q_loss=q_loss)
        if dconfig.obj_func_enabled:
            tflog_utils.log_scalars(policy_obj_loss=pi_obj_loss,
                                    objective_loss=obj_loss)

        self.restore_savers = self._create_restore_savers(dconfig)
        self.saver = tf.train.Saver(max_to_keep=1000, save_relative_paths=True)
        self.summary = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(
            f'{logdir}_agent{self.worker_index}')

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        init_ops = [target_init]
        self.sess.run(init_ops)

        rb_handle, large_rb_handle = self.sess.run([
            replay_buffer_dataset_iterator.string_handle(),
            large_replay_buffer_dataset_iterator.string_handle()
        ])

        # Return all created tf ops
        return utils.DotDict(locals())
Example 10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 model='DQN',
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 pretrained_model_file=None):

        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (str): currently supports 'DQN' and 'DDQN'
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            pretrained_model_file (str): filepath to .pth file with pretrained model weights
        """
        if model not in ('DQN', 'DDQN'):
            raise ValueError("model must be 'DQN' or 'DDQN'")

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.model = model

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        if pretrained_model_file:
            weights = torch.load(pretrained_model_file)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        if self.model == 'DQN':
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        elif self.model == 'DDQN':
            argmax_actions = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, argmax_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
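For context, a typical way to drive this Agent is an epsilon-greedy training loop like the sketch below. The environment choice, episode count and epsilon schedule are assumptions for illustration, not part of the original example; the classic Gym API (observation from reset, 4-tuple from step) is assumed, matching the snippets above.

import gym

env = gym.make('LunarLander-v2')  # assumed example task: 8-dimensional state, 4 discrete actions
agent = Agent(state_size=8, action_size=4, seed=0, model='DDQN')

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(1, 1001):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store transition, learn every update_every steps
        state = next_state
    eps = max(eps_end, eps_decay * eps)  # decay exploration between episodes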
Example 11
        # identity loss(L1)
        Gl2m_id_loss = criterion_identity(m_1, M)
        Gm2h_id_loss = criterion_identity(
            h_1, H) + (criterion_identity(h_2, H) * 0.5)
        identity_loss = (Gl2m_id_loss + Gm2h_id_loss * 0.67) * 0.5

        #total_loss
        loss_G = identity_loss * 0.5 + G_loss + cycle_loss * 10
        loss_G.backward()
        optimizer_G.step()
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        optimizer_Dm.zero_grad()
        #real
        Dm_loss = criterion_GAN(Dm(M), target_real)
        # fake (note: ReplayBuffer() is re-created on every call here, so the buffer has
        # no history and cannot replay past generated samples; such buffers are normally
        # instantiated once, outside the training loop)
        m_1 = ReplayBuffer().push_and_pop(m_1)
        m_2 = ReplayBuffer().push_and_pop(m_2)
        m_3 = ReplayBuffer().push_and_pop(m_3)
        Dm_f_loss = criterion_GAN(Dm(m_1.detach()), target_fake)
        Dm_f_loss += criterion_GAN(Dm(m_2.detach()), target_fake) * 0.5
        Dm_f_loss += criterion_GAN(Dm(m_3.detach()), target_fake) * 0.25
        # Total loss
        loss_Dm = (Dm_f_loss + Dm_loss) * 0.5

        loss_Dm.backward()
        optimizer_Dm.step()
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        optimizer_Dh.zero_grad()
        #real
        Dh_loss = criterion_GAN(Dh(H), target_real)
        #fake
Example 12
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        # Print debug statements
        self.debug = False

        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (policy) model
        self.actor_lr = 1e-4
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learning_rate=self.actor_lr)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learning_rate=self.actor_lr)

        # Critic (value) model
        self.critic_lr = 1e-4
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learning_rate=self.critic_lr)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learning_rate=self.critic_lr)

        # Print Actor / Critic NN architectures
        if self.debug:
            self.actor_local.model.summary()
            self.critic_local.model.summary()

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 1.5e-1
        self.exploration_sigma = 2.0e-2
        self.noise = OUNoise(self.action_size,
                             mu=self.exploration_mu,
                             theta=self.exploration_theta,
                             sigma=self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        score = self.total_reward / float(
            self.count) if self.count else -np.inf
        if score > self.best_score:
            self.best_score = score
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn if enough samples are in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # Add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next state actions and Q values from target networks
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
Example 13
class DDPG_agent():
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """init the agent"""
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        
        # Construct Actor networks
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        
        # Construct Critic networks 
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        
        # noise processing
        self.noise = OUNoise((num_agents, action_size), random_seed)
        
        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
        
    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        # convert state from numpy to pytorch array 
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        
        return np.clip(action, -1, 1)
        
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(state.shape[0]):
            self.memory.add(state[i, :], action[i], reward[i], next_state[i, :], done[i])
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
        
    def reset(self):
        """ reset noise """
        self.noise.reset()
        
    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example 14
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        
        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
Example 15
    def __init__(self, task):
        # Print debug statements
        self.debug = False

        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (policy) model
        self.actor_lr = 1e-4
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learning_rate=self.actor_lr)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learning_rate=self.actor_lr)

        # Critic (value) model
        self.critic_lr = 1e-4
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learning_rate=self.critic_lr)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learning_rate=self.critic_lr)

        # Print Actor / Critic NN architectures
        if self.debug:
            self.actor_local.model.summary()
            self.critic_local.model.summary()

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 1.5e-1
        self.exploration_sigma = 2.0e-2
        self.noise = OUNoise(self.action_size,
                             mu=self.exploration_mu,
                             theta=self.exploration_theta,
                             sigma=self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        # Episode variables
        self.reset_episode()
Example 16
class MADDPG:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPG(state_size, action_size, num_agents, random_seed),
            DDPG(state_size, action_size, num_agents, random_seed)
        ]
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.eps = EPS_START
        self.eps_decay = 1 / EPS_EP_END  # set decay rate based on epsilon end target

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def target_act(self, states):
        """Returns actions for given state as per current policy."""
        actions = [
            agent.target_act(state)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        state = np.asanyarray(state)
        action = np.asanyarray(action)
        reward = np.asanyarray(reward)
        next_state = np.asanyarray(next_state)
        done = np.asanyarray(done)
        self.memory.add(state.reshape((1, self.num_agents, -1)), action.reshape((1, self.num_agents, -1)), \
                        reward.reshape((1, self.num_agents, -1)), next_state.reshape((1,self.num_agents, -1)), \
                        done.reshape((1, self.num_agents, -1)))

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for ai in range(self.num_agents):
                experiences = self.memory.sample()
                self.learn(experiences, ai, GAMMA)

    def reset(self):
        [agent.reset() for agent in self.agents]

    def learn(self, experiences, ai, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        agent = self.agents[ai]
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_states = next_states.view(1, BATCH_SIZE, self.num_agents, -1)
        actions_next = self.target_act(next_states)
        actions_next = torch.cat(actions_next, dim=1)
        next_states = next_states.view(BATCH_SIZE, -1)
        actions_next = actions_next.view(BATCH_SIZE, -1)

        Q_targets_next = agent.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards[:, ai] + (gamma * Q_targets_next *
                                      (1 - dones[:, ai]))
        # Compute critic loss
        Q_expected = agent.critic_local(states.view(BATCH_SIZE, -1),
                                        actions.view(BATCH_SIZE, -1))
        # mean squared error loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        # zero_grad clears gradients accumulated from previous batches
        agent.critic_optimizer.zero_grad()
        # compute gradients for every tensor with requires_grad=True
        critic_loss.backward()
        # update the parameters that have requires_grad=True
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        # take the current states and predict actions
        actions_pred = agent.actor_local(states)
        # maximize Q for the predicted actions by minimizing -Q
        actor_loss = -agent.critic_local(states.view(BATCH_SIZE, -1),
                                         actions_pred.view(BATCH_SIZE, -1)).mean()
        # Minimize the loss
        # zero_grad clears gradients accumulated from previous batches
        agent.actor_optimizer.zero_grad()
        # compute gradients for every tensor with requires_grad=True
        actor_loss.backward()
        # update the parameters that have requires_grad=True
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(agent.critic_local, agent.critic_target, TAU)
        self.soft_update(agent.actor_local, agent.actor_target, TAU)

        # decay the exploration noise scale
        if self.eps >= EPS_FINAL:
            self.eps -= self.eps_decay
            self.eps = max(self.eps, EPS_FINAL)
        agent.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
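As a usage illustration only (not part of the source example), the sketch below shows how this MADDPG wrapper might be driven in a two-agent episode loop. The environment object env, its reset()/step() API, the observation and action dimensions, and the module-level constants the class references (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, EPS_START, EPS_EP_END, EPS_FINAL) are all assumed to be defined elsewhere.

import numpy as np

# Illustrative training loop for the MADDPG class above; `env` is assumed to
# return one observation row per agent and accept one action row per agent.
num_agents, state_size, action_size = 2, 24, 2   # assumed dimensions
maddpg = MADDPG(state_size, action_size, num_agents, random_seed=0)

for episode in range(1, 1001):
    states = env.reset()                 # assumed shape: (num_agents, state_size)
    maddpg.reset()                       # reset each agent's exploration noise
    scores = np.zeros(num_agents)
    while True:
        actions = maddpg.act(states)     # one action per agent
        next_states, rewards, dones, _ = env.step(actions)   # assumed Gym-like API
        maddpg.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            break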
Example no. 17
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Score tracker and learning parameters
        self.score = 0
        self.best_score = -np.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.3
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 200
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
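For orientation only, here is a hedged sketch of how this Keras-based DDPG agent might be run against a task object. The task API (reset() and step(action) returning next_state, reward, done) is an assumption inferred from the attributes the constructor reads; it is not shown in the source.

# Illustrative interaction loop for the Keras-based DDPG agent above.
# `task` is assumed to expose reset() and step(action) -> (next_state, reward, done),
# plus the state/action size and action bound attributes read in __init__.
agent = DDPG(task)

for episode in range(1, 501):
    state = agent.reset_episode()        # also resets the OU noise process
    done = False
    while not done:
        action = agent.act(state)        # actor prediction + exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print('Episode {:4d}  score: {:7.3f}  best: {:7.3f}'.format(
        episode, agent.score, agent.best_score))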
Example no. 18
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 memory=None,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 pretrained_actor_weights=None,
                 pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            memory: optional shared ReplayBuffer (a new one is created when None)
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay for the critic optimizer
            pretrained_actor_weights (str): filepath to .pth file with pretrained actor weights
            pretrained_critic_weights (str): filepath to .pth file with pretrained critic weights
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)

        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        if memory:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
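A hedged usage sketch for the PyTorch Agent above, assuming a Gym-style continuous-control environment env and the module-level constants used as default arguments (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY); none of this loop appears in the source.

# Illustrative training loop for the DDPG Agent above; `env` is an assumed
# Gym-style environment with a continuous action space in [-1, 1].
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=2)

for episode in range(1, 301):
    state = env.reset()
    agent.reset()                        # reset the OU exploration noise
    score, done = 0.0, False
    while not done:
        action = agent.act(state)[0]     # act() returns a (1, action_size) array
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward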
Example no. 19
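The fragment below is part of a CycleGAN-style training loop, where ReplayBuffer is the image-pool variant: push_and_pop stores generated fakes and returns either the new image or a randomly chosen older one, so the discriminators train against a history of fakes. The sketch that follows shows that interface under assumptions (a pool of 50 and the 50/50 swap rule from the original CycleGAN pool, not taken from this listing); the pool objects referenced in the fragment (fake_c_buffer, fake_d_buffer) are assumed to have been created once before the loop, e.g. fake_c_buffer = ReplayBuffer().

import random
import torch

# Minimal sketch of a CycleGAN-style image pool; the max size and the 50/50
# swap rule are assumptions based on the original CycleGAN buffer.
class ReplayBuffer:
    def __init__(self, max_size=50):
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        out = []
        for element in batch.data:
            element = element.unsqueeze(0)
            if len(self.data) < self.max_size:
                # pool not full yet: store the fake and return it unchanged
                self.data.append(element)
                out.append(element)
            elif random.uniform(0, 1) > 0.5:
                # half the time, return an older fake and store the new one
                i = random.randint(0, self.max_size - 1)
                out.append(self.data[i].clone())
                self.data[i] = element
            else:
                out.append(element)
        return torch.cat(out)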
        Gc2d_loss = criterion_GAN(Dd(df), target_real)
        G_loss = (Gc2d_loss + Gd2c_loss) * 0.5
        #cycle_loss
        cdc_cycle_loss = criterion_cycle(cr, c)
        dcd_cycle_loss = criterion_cycle(dr, d)
        cycle_loss = (dcd_cycle_loss + cdc_cycle_loss) * 0.5
        #total_loss
        loss_G = cycle_loss * 10 + G_loss + identity_loss * 0.5
        loss_G.backward()
        optimizer_G.step()
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        optimizer_Dc.zero_grad()
        #real
        Dc_c_loss = criterion_GAN(Dc(c), target_real)
        # fake: draw from the image pool (created once before the loop; name assumed)
        cf = fake_c_buffer.push_and_pop(cf)
        Dc_cf = Dc(cf.detach())
        Dc_cf_loss = criterion_GAN(Dc_cf, target_fake)
        # Total loss
        loss_Dc = (Dc_c_loss + Dc_cf_loss) * 0.5
        loss_Dc.backward()

        optimizer_Dc.step()
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        optimizer_Dd.zero_grad()
        #real
        Dd_d_loss = criterion_GAN(Dd(d), target_real)
        # fake: draw from the image pool (created once before the loop; name assumed)
        df = fake_d_buffer.push_and_pop(df)
        Dd_df = Dd(df.detach())
        Dd_df_loss = criterion_GAN(Dd_df, target_fake)