Example #1
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.env = env        
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau       

        # initialize networks 
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)
        
        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param)
            
        # initialize optimizers 
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = Buffer(buffer_maxlen)
Example #2
class DecoupledA3CAgent:
    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env

        self.gamma = gamma
        self.lr = lr
        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        self.global_value_network = ValueNetwork(
            self.env.observation_space.shape[0], 1)
        self.global_value_network.share_memory()
        self.global_policy_network = PolicyNetwork(
            self.env.observation_space.shape[0], self.env.action_space.n)
        self.global_policy_network.share_memory()
        self.global_value_optimizer = optim.Adam(
            self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(
            self.global_policy_network.parameters(), lr=lr)

        self.workers = [DecoupledWorker(i, env, self.gamma, self.global_value_network, self.global_policy_network,\
             self.global_value_optimizer, self.global_policy_optimizer, self.global_episode, self.GLOBAL_MAX_EPISODE) for i in range(mp.cpu_count())]

    def train(self):
        print("Training on {} cores".format(mp.cpu_count()))
        input("Enter to start")

        [worker.start() for worker in self.workers]
        [worker.join() for worker in self.workers]

    def save_model(self):
        torch.save(self.global_value_network.state_dict(),
                   "a3c_value_model.pth")
        torch.save(self.global_policy_network.state_dict(),
                   "a3c_policy_model.pth")
Example #3
    def __init__(self, id, env, gamma, global_value_network,
                 global_policy_network, global_value_optimizer,
                 global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim,
                                                  self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()
Example #4
    def __init__(self):

        self.policy = PolicyNetwork(action_space=self.ACTION_SPACE)

        self.value_network = ValueNetwork()

        self.env = gym.make(self.ENV_ID)

        self.global_steps = 0

        self.history = []

        self.hiscore = None
Example #5
def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1.Set some necessary seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2.Create actor, critic, EnvSampler() and TRPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size,
                          action_size,
                          hidden_sizes=args.hidden_sizes,
                          init_std=args.init_std)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    trpo = TRPO(actor, critic, args.value_lr, args.value_steps_per_update,
                args.cg_steps, args.linesearch_steps, args.gamma, args.tau,
                args.damping, args.max_kl, device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = trpo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss
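
A hedged driver sketch for the generator-style main() above. Every Namespace field mirrors an attribute that main() reads; all of the values are placeholders rather than settings from the original project.

from argparse import Namespace

args = Namespace(env_name="Pendulum-v0", device="cpu", seed=0,
                 hidden_sizes=(64, 64), init_std=1.0, max_episode_step=200,
                 value_lr=1e-3, value_steps_per_update=50, cg_steps=10,
                 linesearch_steps=10, gamma=0.99, tau=0.95, damping=0.1,
                 max_kl=0.01, episodes=100, batch_size=2048)

for step, episode_reward, actor_loss, value_loss in main(args):
    print(step, episode_reward, actor_loss, value_loss)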
Example #6
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)
Example #7
    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env

        self.gamma = gamma
        self.lr = lr
        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        self.global_value_network = ValueNetwork(
            self.env.observation_space.shape[0], 1)
        self.global_policy_network = PolicyNetwork(
            self.env.observation_space.shape[0], self.env.action_space.n)
        self.global_value_optimizer = optim.Adam(
            self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(
            self.global_policy_network.parameters(), lr=lr)

        self.workers = [DecoupledWorker(i, env, self.gamma, self.global_value_network, self.global_policy_network,\
             self.global_value_optimizer, self.global_policy_optimizer, self.global_episode, self.GLOBAL_MAX_EPISODE) for i in range(mp.cpu_count())]
Example #8
def train_reinforce(args):
    '''
    Parse arguments and construct objects for training a REINFORCE model with no baseline.
    '''
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    token_tables = op.build_token_tables()

    # initialize tensorboard for logging output
    from os import path
    train_logger = None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'),
                                        flush_secs=1)

    # Load Models
    policy = RobustFill(string_size=len(op.CHARACTER),
                        string_embedding_size=args.embedding_size,
                        decoder_inp_size=args.embedding_size,
                        hidden_size=args.hidden_size,
                        program_size=len(token_tables.op_token_table),
                        device=device)
    value = ValueNetwork(args.embedding_size, args.hidden_size).to(device)
    if args.continue_training_policy:
        policy.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.checkpoint_filename),
                       map_location=device))
    elif args.continue_training:
        policy.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.checkpoint_filename),
                       map_location=device))
        value.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.val_checkpoint_filename),
                       map_location=device))
    policy = policy.to(device)
    value = value.to(device)
    # Initialize Optimizer
    if (args.optimizer == 'sgd'):
        pol_opt = optim.SGD(policy.parameters(), lr=args.lr)
        val_opt = optim.SGD(value.parameters(), lr=args.lr)
    else:
        pol_opt = optim.Adam(policy.parameters(), lr=args.lr)
        val_opt = optim.Adam(value.parameters(), lr=args.lr)

    # Load Environment
    env = RobustFillEnv()
    train_reinforce_(
        args,
        policy=policy,
        value=value,
        pol_opt=pol_opt,
        value_opt=val_opt,
        env=env,
        train_logger=train_logger,
        checkpoint_filename=args.checkpoint_filename,
        checkpoint_step_size=args.checkpoint_step_size,
        checkpoint_print_tensors=args.print_tensors,
    )
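
A hypothetical call sketch for train_reinforce(); each Namespace field mirrors an attribute the function reads, and all of the values (including the checkpoint filenames) are placeholders rather than settings from the original project.

from argparse import Namespace

args = Namespace(log_dir=None, embedding_size=128, hidden_size=512,
                 continue_training_policy=False, continue_training=False,
                 checkpoint_filename="policy.pth",
                 val_checkpoint_filename="value.pth",
                 optimizer="adam", lr=1e-3,
                 checkpoint_step_size=100, print_tensors=False)

train_reinforce(args)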
Example #9
def run_learned_baseline(discount_factors, learn_rates, hidden_dims,
                         init_temps, stochasticity, n_runs, n_episodes):
    # learned baseline
    best_result = 0
    best_settings = dict()
    results_file = f'results/s{stochasticity}_learned_baseline.csv'
    best_settings_file = f'results/s{stochasticity}_learned_baseline_best_settings.pkl'

    with open(results_file, 'w') as f:
        f.write(
            'discount_factor,learn_rate_policy,learn_rate_value,hidden_dim_policy,hidden_dim_value,init_temp,result'
            + '\n')

    for discount_factor in discount_factors:
        for learn_rate_policy in learn_rates:
            for learn_rate_value in learn_rates:
                for hidden_dim_policy in hidden_dims:
                    for hidden_dim_value in hidden_dims:
                        for init_temp in init_temps:
                            print('#' * 30)
                            print('#' * 9 + ' NEW SEARCH ' + '#' * 9)
                            print('#' * 30)
                            print()

                            st = time()

                            # change this for learned baseline
                            print(
                                f'Search settings: baseline=run_episodes_with_learned_baseline, discount_factor={discount_factor}, learn_rate_policy={learn_rate_policy}, learn_rate_value={learn_rate_value}, hidden_dim_policy={hidden_dim_policy}, hidden_dim_value={hidden_dim_value}, init_temp={init_temp}'
                            )

                            # initialize the environment
                            env = gym.make('CartPole-v1')

                            result = 0

                            for i in range(n_runs):
                                start_time = time()

                                policy_model = PolicyNetwork(
                                    input_dim=4,
                                    hidden_dim=hidden_dim_policy,
                                    output_dim=2
                                )  # change input_ and output_dim for gridworld env
                                value_model = ValueNetwork(
                                    input_dim=4, hidden_dim=hidden_dim_value
                                )  # change input_dim for gridworld env
                                seed = 40 + i
                                set_seeds(env, seed)

                                episode_durations, _, _ = run_episodes_with_learned_baseline(
                                    policy_model, value_model, env, n_episodes,
                                    discount_factor, learn_rate_policy,
                                    learn_rate_value, init_temp, stochasticity)
                                result += np.mean(episode_durations)

                                del policy_model
                                del value_model

                                end_time = time()
                                h, m, s = get_running_time(end_time -
                                                           start_time)

                                print(
                                    f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
                                )

                            env.close()
                            result /= n_runs

                            with open(results_file, 'a') as f:
                                f.write(
                                    f'{discount_factor},{learn_rate_policy},{learn_rate_value},{hidden_dim_policy},{hidden_dim_value},{init_temp},{result}'
                                    + '\n')

                            et = time()
                            h, m, s = get_running_time(et - st)

                            print(
                                f'Done with search in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
                            )
                            print(
                                f'Average number of steps per episode: {result}'
                            )

                            if result > best_result:
                                best_result = result
                                best_settings[
                                    'discount_factor'] = discount_factor
                                best_settings[
                                    'learn_rate_policy'] = learn_rate_policy
                                best_settings[
                                    'learn_rate_value'] = learn_rate_value
                                best_settings[
                                    'hidden_dim_policy'] = hidden_dim_policy
                                best_settings[
                                    'hidden_dim_value'] = hidden_dim_value
                                best_settings['init_temp'] = init_temp
                                best_settings['result'] = best_result

                                pkl.dump(best_settings,
                                         open(best_settings_file, 'wb'))

                                print(f'New best result!: {result}')
                                print(f'New best settings!: {best_settings}')
                            print()

    print()
    print()
    print(f'Best settings after completing grid search: {best_settings}')
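
The grid search above can be driven with a call like the following sketch; the candidate values are placeholders, and a results/ directory is assumed to exist because the function writes its CSV and pickle files there.

run_learned_baseline(discount_factors=[0.99, 0.995],
                     learn_rates=[1e-3, 1e-2],
                     hidden_dims=[64, 128],
                     init_temps=[1.0],
                     stochasticity=0,
                     n_runs=3,
                     n_episodes=500)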
Example #10
def run_learned_baseline(stochasticity, n_runs, n_episodes):
    # learned baseline
    dir_path = os.path.dirname(os.path.realpath(__file__))
    best_settings_file = dir_path + f'/cart_pole_parameter_search/s{stochasticity}_learned_baseline_best_settings.pkl'
    eval_file = f'cart_evals/s{stochasticity}_learned_baseline.pkl'

    with open(best_settings_file, 'rb') as pickle_file:
        best_settings = pkl.load(pickle_file)
    discount_factor = best_settings['discount_factor']
    learn_rate_policy = best_settings['learn_rate_policy']
    learn_rate_value = best_settings['learn_rate_value']
    hidden_dim_policy = best_settings['hidden_dim_policy']
    hidden_dim_value = best_settings['hidden_dim_value']
    init_temp = best_settings['init_temp']

    st = time()

    # change this for learned baseline
    print(
        f'Run settings: baseline=run_episodes_with_learned_baseline, discount_factor={discount_factor}, learn_rate_policy={learn_rate_policy}, learn_rate_value={learn_rate_value}, hidden_dim_policy={hidden_dim_policy}, hidden_dim_value={hidden_dim_value}, init_temp={init_temp}'
    )

    # initialize the environment
    env = gym.make('CartPole-v1')

    episode_durations_list = []
    reinforce_loss_list = []
    value_loss_list = []

    for i in range(n_runs):
        start_time = time()

        policy_model = PolicyNetwork(
            input_dim=4, hidden_dim=hidden_dim_policy,
            output_dim=2)  # change input_ and output_dim for gridworld env
        value_model = ValueNetwork(
            input_dim=4,
            hidden_dim=hidden_dim_value)  # change input_dim for gridworld env
        seed = 40 + i
        set_seeds(env, seed)

        episode_durations, reinforce_loss, value_loss = run_episodes_with_learned_baseline(
            policy_model, value_model, env, n_episodes, discount_factor,
            learn_rate_policy, learn_rate_value, init_temp, stochasticity)

        episode_durations_list.append(episode_durations)
        reinforce_loss_list.append(reinforce_loss)
        value_loss_list.append(value_loss)

        del policy_model
        del value_model

        end_time = time()
        h, m, s = get_running_time(end_time - start_time)

        print(
            f'Done with run {i+1}/{n_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
        )

    env.close()

    et = time()
    h, m, s = get_running_time(et - st)

    evals = {}
    evals['episode_durations'] = episode_durations_list
    evals['reinforce_loss'] = reinforce_loss_list
    evals['value_loss'] = value_loss_list

    pkl.dump(evals, open(eval_file, 'wb'))

    print(
        f'Done with run in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds'
    )
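
The evaluation variant above loads the best hyperparameters saved by the grid search, so a hedged invocation (assuming that pickle file and a cart_evals/ output directory already exist) is simply:

run_learned_baseline(stochasticity=0, n_runs=5, n_episodes=500)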
Example #11
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  #1

        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2],
                                            self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size,
                                             1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size,
                                        self.action_dim).to(self.device)

        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")
Example #12
class TRPOAgent:

    TRAJECTORY_SIZE = 1024

    VF_BATCHSIZE = 64

    MAX_KL = 0.01

    GAMMA = 0.99

    GAE_LAMBDA = 0.98

    ENV_ID = "Pendulum-v0"

    OBS_SPACE = 3

    ACTION_SPACE = 1

    def __init__(self):

        self.policy = PolicyNetwork(action_space=self.ACTION_SPACE)

        self.value_network = ValueNetwork()

        self.env = gym.make(self.ENV_ID)

        self.global_steps = 0

        self.history = []

        self.hiscore = None

    def play(self, n_iters):

        self.epi_reward = 0

        self.epi_steps = 0

        self.state = self.env.reset()

        for _ in range(n_iters):

            trajectory = self.generate_trajectory()

            trajectory = self.compute_advantage(trajectory)

            self.update_policy(trajectory)

            self.update_vf(trajectory)

        return self.history

    def generate_trajectory(self):
        """generate trajectory on current policy
        """

        trajectory = {
            "s":
            np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "a":
            np.zeros((self.TRAJECTORY_SIZE, self.ACTION_SPACE),
                     dtype=np.float32),
            "r":
            np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32),
            "s2":
            np.zeros((self.TRAJECTORY_SIZE, self.OBS_SPACE), dtype=np.float32),
            "done":
            np.zeros((self.TRAJECTORY_SIZE, 1), dtype=np.float32)
        }

        state = self.state

        for i in range(self.TRAJECTORY_SIZE):

            action = self.policy.sample_action(state)

            next_state, reward, done, _ = self.env.step(action)

            trajectory["s"][i] = state

            trajectory["a"][i] = action

            trajectory["r"][i] = reward

            trajectory["s2"][i] = next_state

            trajectory["done"][i] = done

            self.epi_reward += reward

            self.epi_steps += 1

            self.global_steps += 1

            if done:
                state = self.env.reset()

                self.history.append(self.epi_reward)

                recent_score = sum(self.history[-10:]) / 10

                print("====" * 5)
                print("Episode:", len(self.history))
                print("Episode reward:", self.epi_reward)
                print("Global steps:", self.global_steps)

                if len(self.history) > 100 and (self.hiscore is None or
                                                recent_score > self.hiscore):
                    print("*HISCORE UPDATED:", recent_score)
                    self.save_model()
                    self.hiscore = recent_score

                self.epi_reward = 0

                self.epi_steps = 0

            else:
                state = next_state

        self.state = state

        return trajectory

    def compute_advantage(self, trajectory):
        """Compute

        Args:
            trajectory ([type]): [description]
        """

        trajectory["vpred"] = self.value_network(trajectory["s"]).numpy()

        trajectory["vpred_next"] = self.value_network(trajectory["s2"]).numpy()

        is_nonterminals = 1 - trajectory["done"]

        deltas = trajectory["r"] + self.GAMMA * is_nonterminals * trajectory[
            "vpred_next"] - trajectory["vpred"]

        advantages = np.zeros_like(deltas, dtype=np.float32)

        lastgae = 0
        for i in reversed(range(len(deltas))):
            lastgae = deltas[
                i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae
            advantages[i] = lastgae

        trajectory["adv"] = (advantages -
                             advantages.mean()) / (advantages.std() + 1e-8)
        #trajectory["adv"] = advantages

        trajectory["vftarget"] = trajectory["adv"] + trajectory["vpred"]

        return trajectory

    def update_policy(self, trajectory):
        def flattengrads(grads):
            flatgrads_list = [
                tf.reshape(grad, shape=[1, -1]) for grad in grads
            ]
            flatgrads = tf.concat(flatgrads_list, axis=1)
            return flatgrads

        actions = tf.convert_to_tensor(trajectory["a"], dtype=tf.float32)
        states = tf.convert_to_tensor(trajectory["s"], dtype=tf.float32)
        advantages = tf.convert_to_tensor(trajectory["adv"], dtype=tf.float32)

        old_means, old_stdevs = self.policy(states)
        old_logp = compute_logprob(old_means, old_stdevs, actions)

        with tf.GradientTape() as tape:
            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)

            loss = tf.exp(new_logp - old_logp) * advantages
            loss = tf.reduce_mean(loss)

        g = tape.gradient(loss, self.policy.trainable_variables)
        g = tf.transpose(flattengrads(g))

        @tf.function
        def hvp_func(vector):
            """Compute hessian-vector product
            """
            with tf.GradientTape() as t2:
                with tf.GradientTape() as t1:
                    new_means, new_stdevs = self.policy(states)
                    kl = compute_kl(old_means, old_stdevs, new_means,
                                    new_stdevs)
                    meankl = tf.reduce_mean(kl)

                kl_grads = t1.gradient(meankl, self.policy.trainable_variables)
                kl_grads = flattengrads(kl_grads)
                grads_vector_product = tf.matmul(kl_grads, vector)

            hvp = t2.gradient(grads_vector_product,
                              self.policy.trainable_variables)
            hvp = tf.transpose(flattengrads(hvp))

            return hvp + vector * 1e-2  # add a small damping term for conjugate-gradient stability

        step_direction = cg(hvp_func, g)

        shs = tf.matmul(tf.transpose(step_direction), hvp_func(step_direction))
        lm = tf.sqrt(2 * self.MAX_KL / shs)
        fullstep = lm * step_direction

        expected_improve = tf.matmul(tf.transpose(g), fullstep)
        fullstep = restore_shape(fullstep, self.policy.trainable_variables)

        params_old = [var.numpy() for var in self.policy.trainable_variables]
        old_loss = loss

        for stepsize in [0.5**i for i in range(10)]:
            params_new = [
                p + step * stepsize for p, step in zip(params_old, fullstep)
            ]
            self.policy.set_weights(params_new)

            new_means, new_stdevs = self.policy(states)
            new_logp = compute_logprob(new_means, new_stdevs, actions)

            new_loss = tf.reduce_mean(tf.exp(new_logp - old_logp) * advantages)
            improve = new_loss - old_loss

            kl = compute_kl(old_means, old_stdevs, new_means, new_stdevs)
            mean_kl = tf.reduce_mean(kl)

            print(f"Expected: {expected_improve} Actual: {improve}")
            print(f"KL {mean_kl}")

            if mean_kl > self.MAX_KL * 1.5:
                print("violated KL constraint. shrinking step.")
            elif improve < 0:
                print("surrogate didn't improve. shrinking step.")
            else:
                print("Stepsize OK!")
                break
        else:
            print("更新に失敗")
            self.policy.set_weights(params_old)

    def update_vf(self, trajectory):

        for _ in range(self.TRAJECTORY_SIZE // self.VF_BATCHSIZE):

            indx = np.random.choice(self.TRAJECTORY_SIZE,
                                    self.VF_BATCHSIZE,
                                    replace=True)

            with tf.GradientTape() as tape:
                vpred = self.value_network(trajectory["s"][indx])
                vtarget = trajectory["vftarget"][indx]
                loss = tf.reduce_mean(tf.square(vtarget - vpred))

            variables = self.value_network.trainable_variables
            grads = tape.gradient(loss, variables)
            self.value_network.optimizer.apply_gradients(zip(grads, variables))

    def save_model(self):

        self.policy.save_weights("checkpoints/actor")

        self.value_network.save_weights("checkpoints/critic")

        print()
        print("Model Saved")
        print()

    def load_model(self):

        self.policy.load_weights("checkpoints/actor")

        self.value_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0

            steps = 0

            done = False

            state = env.reset()

            while not done:

                action = self.policy.sample_action(state)

                next_state, reward, done, _ = env.step(action)

                state = next_state

                total_reward += reward

                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps:", steps)
            print()
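
A minimal usage sketch for the TensorFlow TRPOAgent above; the iteration and test-episode counts are illustrative.

agent = TRPOAgent()
history = agent.play(n_iters=50)       # 50 cycles of rollout, advantage estimation, policy and value updates
agent.test_play(n=3, monitordir=None)  # roll out the trained policy without video recording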
Example #13
class DRTRPOAgent():
    """
    DR TRPO 
    """
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_adv_mc(self, trajectory):
        """
        Compute the advantage of all (st,at) in trajectory.
        The advantage is estimated with MC: the discounted reward sum (from the trajectory) minus the value predicted by the network.
        """
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(
            self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        advantages = value_targets - values
        return advantages, value_loss

    def compute_adv_td(self, state, next_state, reward):
        """
        Compute the advantage of a single (s,a) using TD, i.e. r + v(s') - v(s); accuracy depends heavily on the value network.
        """
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        reward = torch.as_tensor(reward)
        state_value = self.value_network.forward(state)
        next_state_value = self.value_network.forward(next_state)
        value_target = reward + next_state_value
        advantage = value_target - state_value
        value_loss = F.mse_loss(state_value, value_target)
        return advantage, value_loss

    def compute_policy_loss_kl(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (KL Constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits
        state_adv = torch.FloatTensor(state_adv).to(self.device)
        denom = torch.sum(torch.exp(state_adv / beta) * pi_dist)
        new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom
        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_policy_loss_wass(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (Wasserstein Constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits
        state_adv = torch.FloatTensor(state_adv).to(self.device)
        """Find argmax_j {A(s,aj) - β*d(aj,ai)}."""
        best_j = []
        for i in range(self.action_dim):
            opt_j = 0
            opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i)
            for j in range(self.action_dim):
                cur_val = state_adv[j] - beta * self.compute_distance(j, i)
                if cur_val > opt_val:
                    opt_j = j
                    opt_val = cur_val
            best_j.append(opt_j)

        new_pi_dist = torch.zeros(self.action_dim)
        for j in range(self.action_dim):
            for i in range(self.action_dim):
                if j == best_j[i]:
                    new_pi_dist[j] += pi_dist[i]

        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_distance(self, a1, a2):
        if a1 == a2:
            return 0
        else:
            return 1

    def update(self, value_loss, policy_loss):
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
Example #14
class A2CAgent():
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]  # discounted return from each timestep onward
        value_targets = rewards.view(
            -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(
                self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # H(p) = -sum(p * log p)
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
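
A hedged training-loop sketch for the A2CAgent above: it collects one full episode as [state, action, reward, next_state, done] entries and then calls update(). The environment id and episode count are illustrative.

import gym

env = gym.make("CartPole-v1")
agent = A2CAgent(env, gamma=0.99, lr=1e-3)

for episode in range(500):
    state = env.reset()
    trajectory, done = [], False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append([state, action, reward, next_state, done])
        state = next_state
    agent.update(trajectory)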
Example #15
class A2CAgent():
    def __init__(self, env, gamma, lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory, adv_method):
        """   
        When gamma is large, the NN loss does not converge, we should use MC to estimate advantage. 
        When gamma is small (i.e. 0.9), the NN loss decreases after training, we can use TD to estimate advantage. 
        """
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]  # discounted return from each timestep onward
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = logits
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # H(p) = -sum(p * log p)
        entropy = torch.stack(entropy).sum()

        # 0 for MC, 1 for TD
        if adv_method == 0:
            advantages = value_targets - values
        if adv_method == 1:
            advantages = rewards - values + self.gamma * torch.cat(
                (values[1:], torch.FloatTensor([[0]])), dim=0)

        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantages.detach()
        policy_loss = policy_loss.sum() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory, adv_method):
        value_loss, policy_loss = self.compute_loss(trajectory, adv_method)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
Example #16
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value Loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        #delayed update for policy net and target value nets
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
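
A hedged training-loop sketch for the SACAgent above, assuming a continuous-control environment and that BasicBuffer exposes push() and __len__() (neither is shown in the snippet); all hyperparameter values are placeholders.

import gym

env = gym.make("Pendulum-v0")
agent = SACAgent(env, gamma=0.99, tau=0.005, v_lr=3e-4, q_lr=3e-4,
                 policy_lr=3e-4, buffer_maxlen=100000)

batch_size = 64
for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed BasicBuffer API
        if len(agent.replay_buffer) > batch_size:                          # assumed __len__ support
            agent.update(batch_size)
        state = next_state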
Example #17
class DecoupledWorker(mp.Process):
    def __init__(self, id, env, gamma, global_value_network,
                 global_policy_network, global_value_optimizer,
                 global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim,
                                                  self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.local_policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0]
                                    for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory
                                    ]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2]
                                     for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory
                                         ]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory
                                   ]).view(-1, 1).to(self.device)

        # compute value target
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\
             * rewards[j:]) for j in range(rewards.size(0))]  # discounted return from each timestep onward
        value_targets = rewards.view(
            -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(
                self.device)

        # compute value loss
        values = self.local_value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.local_policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # H(p) = -sum(p * log p)
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update_global(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.global_value_optimizer.zero_grad()
        value_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(
                self.local_value_network.parameters(),
                self.global_value_network.parameters()):
            global_params._grad = local_params._grad
        self.global_value_optimizer.step()

        self.global_policy_optimizer.zero_grad()
        policy_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(
                self.local_policy_network.parameters(),
                self.global_policy_network.parameters()):
            global_params._grad = local_params._grad
            #print(global_params._grad)
        self.global_policy_optimizer.step()

    def sync_with_global(self):
        self.local_value_network.load_state_dict(
            self.global_value_network.state_dict())
        self.local_policy_network.load_state_dict(
            self.global_policy_network.state_dict())

    def run(self):
        state = self.env.reset()
        trajectory = []  # [[s, a, r, s', done], [], ...]
        episode_reward = 0

        while self.global_episode.value < self.GLOBAL_MAX_EPISODE:
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                with self.global_episode.get_lock():
                    self.global_episode.value += 1
                print(self.name + " | episode: " +
                      str(self.global_episode.value) + " " +
                      str(episode_reward))

                self.update_global(trajectory)
                self.sync_with_global()

                trajectory = []
                episode_reward = 0
                state = self.env.reset()
            else:
                state = next_state
Example #18
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  #1

        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2],
                                            self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size,
                                             1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size,
                                        self.action_dim).to(self.device)

        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(
                f"Invalid size, expected shape {self.img_size}, got {state.shape}"
            )
            return None

        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(
            self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # states and next states are lists of ndarrays, np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1,
                                                             2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Process images
        features = self.feature_net(
            states)  #.contiguous() # Properly shaped due to batching
        next_features = self.feature_net(next_states)  #.contiguous()

        features = torch.reshape(features, (64, self.input_size))
        next_features = torch.reshape(next_features, (64, self.input_size))

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)

        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
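
A short usage note for the image-based SACAgent above: get_action() expects a raw (500, 500, 3) frame, and update() hard-codes a batch of 64 in its torch.reshape calls, so the buffer should be sampled with batch_size=64. The environment constructor below is hypothetical, as is the buffer push() call.

env = make_image_env()   # hypothetical: any env whose observations are 500x500x3 RGB frames
agent = SACAgent(env, gamma=0.99, tau=0.005, v_lr=3e-4, q_lr=3e-4,
                 policy_lr=3e-4, buffer_maxlen=50000)

frame = env.reset()
action = agent.get_action(frame)   # returns an action rescaled to the env's action range
# ... after filling agent.replay_buffer with transitions (push API assumed) ...
agent.update(batch_size=64)        # 64 matches the hard-coded reshape in update()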