def test_vectorized_discount(self):
    state = th.randn(TIME_STEPS, NUM_ENVS, VECTOR_SIZE)
    action = th.randn(TIME_STEPS, NUM_ENVS)
    reward = th.randn(TIME_STEPS, NUM_ENVS)
    bootstrap = th.randn(NUM_ENVS)
    done = th.zeros_like(reward)
    for i in list(reversed(range(TIME_STEPS)))[:4]:
        done[i, i % NUM_ENVS] = 1

    # Compute the discounted rewards as if collected from
    # non-vectorized environments
    nonvec_discounted_rewards = []
    for i in range(NUM_ENVS):
        replay = ch.ExperienceReplay()
        for t in range(TIME_STEPS):
            replay.append(state[t, i, :],
                          action[t, i],
                          reward[t, i],
                          state[t, i, :],
                          done[t, i])
        nonvec_discounted_rewards.append(
            ch.td.discount(GAMMA, replay.reward(), replay.done(), bootstrap[i]))

    # Compute the discounted rewards as a vectorized environment
    replay = ch.ExperienceReplay()
    for t in range(TIME_STEPS):
        replay.append(state[t, :, :],
                      action[t, :],
                      reward[t, :],
                      state[t, :, :],
                      done[t, :])
    vec_discounted_rewards = ch.td.discount(GAMMA,
                                            replay.reward(),
                                            replay.done(),
                                            bootstrap)

    for i in range(NUM_ENVS):
        assert th.all(
            nonvec_discounted_rewards[i][:, 0] == vec_discounted_rewards[:, i],
        )
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
def main(env):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.ActionLambda(env, convert_discrete_to_continuous_action)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
    target_agent = create_target_network(agent)
    optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)

    def get_random_action(state):
        action = torch.tensor([[random.randint(0, ACTION_DISCRETISATION - 1)]])
        return action

    def get_action(state):
        # Original sampling (for unit test)
        # if random.random() < EPSILON:
        #     action = torch.tensor([[random.randint(0, ACTION_DISCRETISATION - 1)]])
        # else:
        #     action = agent(state)[1].argmax(dim=1, keepdim=True)
        # return action
        return agent(state)[0]

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            # Randomly sample a batch of experience
            batch = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(batch)

            # Compute targets
            target_values = target_agent(batch.next_state())[1].max(dim=1, keepdim=True)[0]
            target_values = batch.reward() + DISCOUNT * (1 - batch.done()) * target_values

            # Update Q-function by one step of gradient descent
            pred_values = agent(batch.state())[1].gather(1, batch.action())
            value_loss = F.mse_loss(pred_values, target_values)
            optimiser.zero_grad()
            value_loss.backward()
            optimiser.step()

        if step > UPDATE_START and step % TARGET_UPDATE_INTERVAL == 0:
            # Update target network
            target_agent = create_target_network(agent)
def flatten_episodes(replay, episodes, num_workers):
    """
    TODO: This implementation is not efficient.

    NOTE: Additional info (other than a transition's default fields) is simply
    copied. To know from which worker the data was gathered, you can access
    sars.runner_id

    TODO: This is not great. What is the best behaviour with infos here?
    """
    flat_replay = ch.ExperienceReplay()
    worker_replays = [ch.ExperienceReplay() for w in range(num_workers)]
    flat_episodes = 0
    for sars in replay:
        state = sars.state.view(_min_size(sars.state))
        action = sars.action.view(_min_size(sars.action))
        reward = sars.reward.view(_min_size(sars.reward))
        next_state = sars.next_state.view(_min_size(sars.next_state))
        done = sars.done.view(_min_size(sars.done))
        fields = set(sars._Transition__fields) - {
            'state', 'action', 'reward', 'next_state', 'done'
        }
        infos = {f: getattr(sars, f) for f in fields}
        for worker in range(num_workers):
            # Populate infos per worker
            worker_infos = {'runner_id': worker}
            for key, value in infos.items():
                worker_infos[key] = value[worker]

            # The following attempts to split additional infos. (WIP. Remove?)
            # infos = {}
            # for f in fields:
            #     value = getattr(sars, f)
            #     if isinstance(value, Iterable) and len(value) == num_workers:
            #         value = value[worker]
            #     elif _istensorable(value):
            #         tvalue = ch.totensor(value)
            #         tvalue = tvalue.view(_min_size(tvalue))
            #         if tvalue.size(0) == num_workers:
            #             value = tvalue[worker]
            #     infos[f] = value

            worker_replays[worker].append(
                state[worker],
                action[worker],
                reward[worker],
                next_state[worker],
                done[worker],
                **worker_infos,
            )
            if bool(done[worker]):
                flat_replay += worker_replays[worker]
                worker_replays[worker] = ch.ExperienceReplay()
                flat_episodes += 1
                if flat_episodes >= episodes:
                    break
        if flat_episodes >= episodes:
            break
    return flat_replay
def main(env='Pendulum-v0'):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)

    actor = Actor(HIDDEN_SIZE, stochastic=False, layer_norm=True)
    critic = Critic(HIDDEN_SIZE, state_action=True, layer_norm=True)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    get_action = lambda s: (actor(s) + ACTION_NOISE * torch.randn(1, 1)).clamp(-1, 1)

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             batch.reward(),
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
def test_save_and_load(self):
    old_replay = self.replay
    vector = np.random.rand(VECTOR_SIZE)
    for i in range(NUM_SAMPLES):
        old_replay.append(vector, vector, i, vector, False, vector=vector)

    # save the old replay to a temporary file
    old_replay.save('testing_temp_file.pt')

    # load the saved file into a new replay
    new_replay = ch.ExperienceReplay()
    new_replay.load('testing_temp_file.pt')

    # check size
    self.assertEqual(len(old_replay._storage), len(new_replay._storage))
    self.assertEqual(len(old_replay.state()), len(new_replay.state()))
    self.assertEqual(len(old_replay.action()), len(new_replay.action()))
    self.assertEqual(len(old_replay.reward()), len(new_replay.reward()))
    self.assertEqual(len(old_replay.next_state()), len(new_replay.next_state()))
    self.assertEqual(len(old_replay.done()), len(new_replay.done()))
    self.assertEqual(len(old_replay.vector()), len(new_replay.vector()))

    # check content
    for a, b in zip(old_replay, new_replay):
        self.assertTrue(close(a.state, b.state))
        self.assertTrue(close(a.action, b.action))
        self.assertTrue(close(a.reward, b.reward))
        self.assertTrue(close(a.next_state, b.next_state))
        self.assertTrue(close(a.done, b.done))
        self.assertTrue(close(a.vector, b.vector))

    os.remove('testing_temp_file.pt')
def flatten_episodes(replay, episodes, num_workers, extra_info=False):
    """
    NOTE: Additional info (other than a transition's default fields) is simply
    copied. To know from which worker the data was gathered, you can access
    sars.runner_id
    """
    flat_replay = ch.ExperienceReplay()
    worker_replays = [ch.ExperienceReplay() for w in range(num_workers)]
    flat_episodes = 0
    for sars in replay:
        state = sars.state.view(_min_size(sars.state))
        action = sars.action.view(_min_size(sars.action))
        reward = sars.reward.view(_min_size(sars.reward))
        next_state = sars.next_state.view(_min_size(sars.next_state))
        done = sars.done.view(_min_size(sars.done))
        fields = set(sars._Transition__fields) - {
            'state', 'action', 'reward', 'next_state', 'done'
        }
        infos = {f: getattr(sars, f) for f in fields}
        for worker in range(num_workers):
            # Populate infos per worker
            worker_infos = {'runner_id': worker}
            # This slightly slows down the runner!
            # e.g from 1.15 it/sec we go to 1.25 it/sec
            if extra_info:
                for key, value in infos.items():
                    worker_infos[key] = value[worker]
            worker_replays[worker].append(
                state[worker],
                action[worker],
                reward[worker],
                next_state[worker],
                done[worker],
                **worker_infos,
            )
            if bool(done[worker]):
                flat_replay += worker_replays[worker]
                worker_replays[worker] = ch.ExperienceReplay()
                flat_episodes += 1
                if flat_episodes >= episodes:
                    break
        if flat_episodes >= episodes:
            break
    return flat_replay
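# Usage sketch (illustrative only, assuming a vectorized env wrapped in
# ch.envs.Runner with `num_envs` workers and a batched `replay` collected from
# it): flatten_episodes unrolls the batched transitions into complete
# per-worker episodes, and each flattened transition records which worker
# produced it via `runner_id`, as the NOTE above describes.
flat = flatten_episodes(replay, episodes=10, num_workers=num_envs)
for sars in flat:
    worker = sars.runner_id  # index of the worker that generated this transition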
def test_append(self):
    new_replay = ch.ExperienceReplay()
    vector = np.random.rand(VECTOR_SIZE)
    for i in range(NUM_SAMPLES):
        self.replay.append(vector, vector, i, vector, False, vector=vector)
        new_replay.append(vector, vector, i, vector, False, vector=vector)
    self.assertEqual(len(self.replay), len(new_replay))
    new_replay = self.replay + new_replay
    self.assertEqual(NUM_SAMPLES * 2, len(new_replay))
    self.replay += new_replay
    self.assertEqual(NUM_SAMPLES * 3, len(self.replay))
def main(env='HalfCheetahBulletEnv-v0'):
    random.seed(SEED)
    np.random.seed(SEED)
    th.manual_seed(SEED)

    env = gym.make(env)
    env = envs.VisdomLogger(env, interval=1000)
    env = envs.ActionSpaceScaler(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    log_alpha = th.zeros(1, requires_grad=True)
    if USE_AUTOMATIC_ENTROPY_TUNING:
        # Heuristic target entropy
        target_entropy = -np.prod(env.action_space.shape).item()
    else:
        target_entropy = TARGET_ENTROPY

    state_size = env.state_size
    action_size = env.action_size
    policy = Policy(input_size=state_size, output_size=action_size)
    critic_qf1 = MLP(input_size=state_size + action_size, output_size=1)
    critic_qf2 = MLP(input_size=state_size + action_size, output_size=1)
    target_qf1 = copy.deepcopy(critic_qf1)
    target_qf2 = copy.deepcopy(critic_qf2)

    policy_opt = optim.Adam(policy.parameters(), lr=ALL_LR)
    qf1_opt = optim.Adam(critic_qf1.parameters(), lr=ALL_LR)
    qf2_opt = optim.Adam(critic_qf2.parameters(), lr=ALL_LR)
    alpha_opt = optim.Adam([log_alpha], lr=ALL_LR)

    replay = ch.ExperienceReplay()
    get_action = lambda state: policy(state).rsample()

    for step in range(TOTAL_STEPS):
        # Collect next step
        ep_replay = env.run(get_action, steps=1, render=RENDER)

        # Update policy
        replay += ep_replay
        replay = replay[-REPLAY_SIZE:]
        if len(replay) > MIN_REPLAY:
            update(env,
                   replay,
                   policy,
                   critic_qf1,
                   critic_qf2,
                   target_qf1,
                   target_qf2,
                   log_alpha,
                   policy_opt,
                   qf1_opt,
                   qf2_opt,
                   alpha_opt,
                   target_entropy)
def main(num_steps=10000000,
         env_name='PongNoFrameskip-v4',
         # env_name='BreakoutNoFrameskip-v4',
         seed=42):
    th.set_num_threads(1)
    random.seed(seed)
    th.manual_seed(seed)
    np.random.seed(seed)

    env = gym.make(env_name)
    env = envs.Logger(env, interval=1000)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(seed)

    dqn = DQN(env)
    target_dqn = copy.deepcopy(dqn)
    optimizer = optim.RMSprop(dqn.parameters(),
                              lr=LR,
                              alpha=0.95,
                              eps=0.01,
                              centered=True)
    replay = ch.ExperienceReplay()
    epsilon = EPSILON
    get_action = lambda state: epsilon_greedy(dqn(state), epsilon)

    for step in range(num_steps // UPDATE_FREQ + 1):
        # Sample some transitions
        ep_replay = env.run(get_action, steps=UPDATE_FREQ)
        replay += ep_replay

        if step * UPDATE_FREQ < 1e6:
            # Update epsilon
            epsilon -= 9.9e-7 * UPDATE_FREQ

        if step * UPDATE_FREQ > EXPLORATION_STEPS:
            # Only keep the last 1M transitions
            replay = replay[-REPLAY_SIZE:]

            # Update Q-function
            update(replay, optimizer, dqn, target_dqn, env=env)

            if step % TARGET_UPDATE_FREQ == 0:
                target_dqn.load_state_dict(dqn.state_dict())
def run_trpo():
    ch.debug.debug()
    for i, env_name in enumerate(sweep.SWEEP):
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=TRPO_RESULTS_PATH,
                                               overwrite=True)

        # Instantiate the env and agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Torch(env)
        env = ch.envs.Runner(env)
        policy = Policy(env)
        baseline = LinearValue(env.state_size)

        # Generate the results
        replay = ch.ExperienceReplay()
        for episode in tqdm(range(1, 1 + env.bsuite_num_episodes), desc=env_name):
            replay += env.run(policy, episodes=1)
            if episode % 10 == 0:
                trpo_update(replay, policy, baseline)
                replay.empty()
number_asset, seq_window, features_all = env.observation_space.shape
assert action_size == number_asset + 1
input_size = features_all - 1

net = ActorCritic(input_size=input_size, hidden_size=50, action_size=action_size)
net_tgt = ActorCritic(input_size=input_size, hidden_size=50, action_size=action_size)
net_tgt.eval()
print(net_tgt)
net_tgt.load_state_dict(net.state_dict())

# create replay
replay = ch.ExperienceReplay()

# create loss function
criterion_mse = nn.MSELoss()

# create optimizers
optimizer_actor = torch.optim.Adam(net.actor.parameters(), lr=0.001)
optimizer_critic = torch.optim.Adam(net.critic.parameters(), lr=0.001)


def update(replay):
    # batch data
    state_batch = replay.state()
    next_state_batch = replay.next_state()
    action_batch = replay.action()
    reward_batch = replay.reward()
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    result = {
        'rewards': [],
        'plosses': [],
        'vlosses': [],
        'qlosses': [],
        'pweights': [],
        'vweights': [],
        'vweights_target': [],
        'qweights1': [],
        'qweights2': [],
    }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    actor = SoftActor(HIDDEN_SIZE)
    critic_1 = Critic(HIDDEN_SIZE, state_action=True)
    critic_2 = Critic(HIDDEN_SIZE, state_action=True)
    value_critic = Critic(HIDDEN_SIZE)
    target_value_critic = create_target_network(value_critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(list(critic_1.parameters()) + list(critic_2.parameters()),
                                   lr=LEARNING_RATE)
    value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)

    def get_random_action(state):
        return torch.tensor([[2 * random.random() - 1]])

    def get_action(state):
        return actor(state).sample()

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]
            result['rewards'].append(replay.reward()[-1].item())

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            # Pre-compute some quantities
            masses = actor(batch.state())
            actions = masses.rsample()
            log_probs = masses.log_prob(actions)
            q_values = torch.min(critic_1(batch.state(), actions.detach()),
                                 critic_2(batch.state(), actions.detach())).view(-1, 1)

            # Compute Q losses
            v_next = target_value_critic(batch.next_state()).view(-1, 1)
            q_old_pred1 = critic_1(batch.state(), batch.action().detach()).view(-1, 1)
            q_old_pred2 = critic_2(batch.state(), batch.action().detach()).view(-1, 1)
            qloss1 = ch.algorithms.sac.action_value_loss(q_old_pred1,
                                                         v_next.detach(),
                                                         batch.reward(),
                                                         batch.done(),
                                                         DISCOUNT)
            qloss2 = ch.algorithms.sac.action_value_loss(q_old_pred2,
                                                         v_next.detach(),
                                                         batch.reward(),
                                                         batch.done(),
                                                         DISCOUNT)

            # Update Q-functions by one step of gradient descent
            qloss = qloss1 + qloss2
            critics_optimiser.zero_grad()
            qloss.backward()
            critics_optimiser.step()
            result['qlosses'].append(qloss.item())

            # Update V-function by one step of gradient descent
            v_pred = value_critic(batch.state()).view(-1, 1)
            vloss = ch.algorithms.sac.state_value_loss(v_pred,
                                                       log_probs.detach(),
                                                       q_values.detach(),
                                                       alpha=ENTROPY_WEIGHT)
            value_critic_optimiser.zero_grad()
            vloss.backward()
            value_critic_optimiser.step()
            result['vlosses'].append(vloss.item())

            # Update policy by one step of gradient ascent
            q_actions = critic_1(batch.state(), actions).view(-1, 1)
            policy_loss = ch.algorithms.sac.policy_loss(log_probs,
                                                        q_actions,
                                                        alpha=ENTROPY_WEIGHT)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['plosses'].append(policy_loss.item())

            # Update target value network
            ch.models.polyak_average(target_value_critic,
                                     value_critic,
                                     POLYAK_FACTOR)

    result['pweights'] = list(actor.parameters())
    result['vweights'] = list(value_critic.parameters())
    result['vweights_target'] = list(target_value_critic.parameters())
    result['qweights1'] = list(critic_1.parameters())
    result['qweights2'] = list(critic_2.parameters())
    return result
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    result = {
        'rewards': [],
        'plosses': [],
        'vlosses': [],
        'pweights': [],
        'vweights': [],
        'target_vweights': [],
        'target_pweights': [],
    }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)

    actor = Actor(HIDDEN_SIZE, stochastic=False, layer_norm=True)
    critic = Critic(HIDDEN_SIZE, state_action=True, layer_norm=True)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    def get_random_action(state):
        return torch.tensor([[2 * random.random() - 1]])

    def get_action(state):
        action = actor(state) + ACTION_NOISE * torch.randn(1, 1)
        return torch.clamp(action, min=-1, max=1)

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            result['rewards'].append(replay.reward()[-1].item())
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             batch.reward(),
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['vlosses'].append(value_loss.item())

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['plosses'].append(policy_loss.item())

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)

    result['pweights'] = list(actor.parameters())
    result['target_pweights'] = list(target_actor.parameters())
    result['vweights'] = list(critic.parameters())
    result['target_vweights'] = list(target_critic.parameters())
    return result
def run(self, get_action, steps=None, episodes=None, render=False):
    """
    Runner wrapper's run method.
    """
    if steps is None:
        steps = float('inf')
        if self.is_vectorized:
            self._needs_reset = True
    elif episodes is None:
        episodes = float('inf')
    else:
        msg = 'Either steps or episodes should be set.'
        raise Exception(msg)

    replay = ch.ExperienceReplay()
    collected_episodes = 0
    collected_steps = 0
    while True:
        if collected_steps >= steps or collected_episodes >= episodes:
            if self.is_vectorized and collected_episodes >= episodes:
                replay = flatten_episodes(replay, episodes, self.num_envs)
                self._needs_reset = True
            return replay
        if self._needs_reset:
            self.reset()
        info = {}
        action = get_action(self._current_state)
        if isinstance(action, tuple):
            skip_unpack = False
            if self.is_vectorized:
                if len(action) > 2:
                    skip_unpack = True
                elif len(action) == 2 and \
                        self.env.num_envs == 2 and \
                        not isinstance(action[1], dict):
                    # action[1] is not info but an action
                    action = (action, )
            if not skip_unpack:
                if len(action) == 2:
                    info = action[1]
                    action = action[0]
                elif len(action) == 1:
                    action = action[0]
                else:
                    msg = 'get_action should return 1 or 2 values.'
                    raise NotImplementedError(msg)
        old_state = self._current_state
        state, reward, done, info = self.env.step(action)
        if not self.is_vectorized and done:
            collected_episodes += 1
            self._needs_reset = True
        elif self.is_vectorized:
            collected_episodes += sum(done)
            # Convert the tuple of info dictionaries (one per worker) into a
            # single dictionary holding a list of values for each key, e.g. from
            # ({key_0: value_0, key_1: value_1},   # worker_0 values
            #  {key_0: value_0, key_1: value_1})   # worker_1 values
            # we get
            # {key_0: [value_0,    # value of worker 0
            #          value_0],   # value of worker 1
            #  key_1: [value_1,    # value of worker 0
            #          value_1]}   # value of worker 1
            tmp_info = defaultdict(list)
            for info_worker in info:
                for key, value in info_worker.items():
                    # Ignore types that cannot be converted to tensors
                    if _istensorable(value):
                        tmp_info[key] += [value]
            info = tmp_info
        replay.append(old_state, action, reward, state, done, **info)
        self._current_state = state
        if render:
            self.env.render()
        collected_steps += 1
    optimizer.zero_grad()
    policy_loss = th.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env = envs.Logger(env, interval=1000)
    env = envs.Torch(env)
    env.seed(SEED)

    policy = PolicyNet()
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    running_reward = 10.0
    replay = ch.ExperienceReplay()

    for i_episode in count(1):
        state = env.reset()
        for t in range(10000):  # Don't infinite loop while learning
            mass = Categorical(policy(state))
            action = mass.sample()
            old_state = state
            state, reward, done, _ = env.step(action)
            replay.append(
                old_state,
                action,
                reward,
                state,
                done,
                # Cache log_prob for later
import cherry as ch

# Wrap environments
env = gym.make('CartPole-v0')
env = ch.envs.Logger(env, interval=1000)
env = ch.envs.Torch(env)

policy = PolicyNet()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
replay = ch.ExperienceReplay()

# Manage transitions
for step in range(1000):
    state = env.reset()
    while True:
        mass = Categorical(policy(state))
        action = mass.sample()
        log_prob = mass.log_prob(action)
        next_state, reward, done, _ = env.step(action)

        # Build the ExperienceReplay
        replay.append(state, action, reward, next_state, done, log_prob=log_prob)
        if done:
            break
        else:
            state = next_state

    # Discounting and normalizing rewards
    rewards = ch.td.discount(0.99, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)
    loss = -th.sum(replay.log_prob() * rewards)
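    # A possible continuation of the excerpt above (sketch, not part of the
    # original snippet): apply the REINFORCE-style loss with a standard
    # PyTorch optimizer step, then clear the replay before the next episode.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    replay.empty()  # discard this episode's transitions once the update is done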
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            # These aliases are kept for readability
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())

            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
def run(self, get_action, steps=None, episodes=None, render=False):
    """
    Runner wrapper's run method.
    """
    if steps is None:
        steps = float('inf')
        if self.is_vectorized:
            self._needs_reset = True
    elif episodes is None:
        episodes = float('inf')
    else:
        msg = 'Either steps or episodes should be set.'
        raise Exception(msg)

    replay = ch.ExperienceReplay()
    collected_episodes = 0
    collected_steps = 0
    while True:
        if collected_steps >= steps or collected_episodes >= episodes:
            if self.is_vectorized and collected_episodes >= episodes:
                replay = flatten_episodes(replay, episodes, self.num_envs)
                self._needs_reset = True
            return replay
        if self._needs_reset:
            self.reset()
        info = {}
        action = get_action(self._current_state)
        if isinstance(action, tuple):
            skip_unpack = False
            if self.is_vectorized:
                if len(action) > 2:
                    skip_unpack = True
                elif len(action) == 2 and \
                        self.env.num_envs == 2 and \
                        not isinstance(action[1], dict):
                    # action[1] is not info but an action
                    action = (action, )
            if not skip_unpack:
                if len(action) == 2:
                    info = action[1]
                    action = action[0]
                elif len(action) == 1:
                    action = action[0]
                else:
                    msg = 'get_action should return 1 or 2 values.'
                    raise NotImplementedError(msg)
        old_state = self._current_state
        state, reward, done, _ = self.env.step(action)
        if not self.is_vectorized and done:
            collected_episodes += 1
            self._needs_reset = True
        elif self.is_vectorized:
            collected_episodes += sum(done)
        replay.append(old_state, action, reward, state, done, **info)
        self._current_state = state
        if render:
            self.env.render()
        collected_steps += 1
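# Usage sketch for the run method above (hedged; `policy` is a hypothetical
# stand-in for any callable returning a torch distribution): `run` accepts
# either `steps=` or `episodes=`, and `get_action` may return either a bare
# action or an (action, info_dict) tuple, in which case the info dict is
# stored on each appended transition (e.g. recoverable via replay.log_prob()).
def get_action(state):
    mass = policy(state)
    action = mass.sample()
    return action, {'log_prob': mass.log_prob(action)}

replay = env.run(get_action, episodes=1)  # or: env.run(get_action, steps=100)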
def setUp(self):
    self.replay = ch.ExperienceReplay()
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) > BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())

            with torch.no_grad():
                advantages = ch.pg.generalized_advantage(DISCOUNT,
                                                         TRACE_DECAY,
                                                         replay.reward(),
                                                         replay.done(),
                                                         replay.value(),
                                                         torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = ch.td.discount(DISCOUNT,
                                         replay.reward(),
                                         replay.done())

            # Policy loss
            log_probs = replay.log_prob()
            policy_loss = ch.algorithms.a2c.policy_loss(log_probs, advantages)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['policy_losses'].append(policy_loss.item())

            # Value loss
            value_loss = ch.algorithms.a2c.state_value_loss(replay.value(),
                                                            returns)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
def run(self, get_action, steps=None, episodes=None, render=False):
    """
    Runner wrapper's run method.
    """
    if steps is None:
        steps = float('inf')
        if self.is_vectorized:
            self._needs_reset = True
    elif episodes is None:
        episodes = float('inf')
    else:
        msg = 'Either steps or episodes should be set.'
        raise Exception(msg)
    steps = 1000

    replay = ch.ExperienceReplay()
    collected_episodes = 0
    collected_steps = 0
    while True:
        print("collected_steps", collected_steps)
        if collected_steps >= steps or collected_episodes >= episodes:
            if self.is_vectorized and collected_episodes >= episodes:
                replay = flatten_episodes(replay, episodes, self.num_envs)
                self._needs_reset = True
            return replay
        if self._needs_reset:
            self.reset()
        info = {}
        action = get_action(self._current_state)
        print("action", action)
        if isinstance(action, tuple):
            skip_unpack = False
            if self.is_vectorized:
                if len(action) > 2:
                    skip_unpack = True
                elif len(action) == 2 and \
                        self.env.num_envs == 2 and \
                        not isinstance(action[1], dict):
                    # action[1] is not info but an action
                    action = (action, )
            if not skip_unpack:
                if len(action) == 2:
                    info = action[1]
                    action = action[0]
                elif len(action) == 1:
                    action = action[0]
                else:
                    msg = 'get_action should return 1 or 2 values.'
                    raise NotImplementedError(msg)
        old_state = self._current_state
        state, reward, done, _ = self.env.step(action)
        # print("reward: ", reward)
        # print("state.shape", state.shape)
        # print("state", state)
        # state = rgb2gray(state)
        state = self.full_obs_to_smol_boi(state)
        # reward = reward.to(ptu.get_device())
        # print("INNER LOOPS")
        # print(state)
        # print(reward)
        # print("gray.shape", gray.shape)
        # print("gray", gray)
        # if collected_steps >= 0:
        #     collected_episodes += 1
        #     self._needs_reset = True
        if not self.is_vectorized and done:
            collected_episodes += 1
            self._needs_reset = True
        elif self.is_vectorized:
            collected_episodes += sum(done)
        replay.append(old_state, action, reward, state, done, **info)
        self._current_state = state
        if render:
            self.env.render()
        collected_steps += 1
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)
    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer,
                                                      step_size=2000,
                                                      gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer,
                                                       step_size=2000,
                                                       gamma=0.5)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            # batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      batch.reward(),
                                                      batch.done(),
                                                      batch.value(),
                                                      torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, batch.reward(), batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                # nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                # nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()
            replay.empty()
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    actor = Actor(input_size=input_size, hidden_size=50, action_size=action_size)
    critic = Critic(input_size=input_size, hidden_size=50, action_size=action_size)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE_ACTOR)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE_CRITIC)
    replay = ch.ExperienceReplay()
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))

    def get_action(state):
        action = actor(state)
        action = action + ou_noise()[0]
        return action

    def get_random_action(state):
        action = torch.softmax(torch.randn(action_size), dim=0)
        return action

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            rewards = ch.normalize(batch.reward())
            # rewards = batch.reward() / 100.0  # changes the convergence a lot
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             rewards,
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
def main(env='Pendulum-v0'):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    actor = SoftActor(HIDDEN_SIZE)
    critic_1 = Critic(HIDDEN_SIZE, state_action=True)
    critic_2 = Critic(HIDDEN_SIZE, state_action=True)
    value_critic = Critic(HIDDEN_SIZE)
    target_value_critic = create_target_network(value_critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(
        (list(critic_1.parameters()) + list(critic_2.parameters())),
        lr=LEARNING_RATE)
    value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)
    get_action = lambda state: actor(state).sample()

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            # Pre-compute some quantities
            states = batch.state()
            rewards = batch.reward()
            old_actions = batch.action()
            dones = batch.done()
            masses = actor(states)
            actions = masses.rsample()
            log_probs = masses.log_prob(actions)
            q_values = torch.min(critic_1(states, actions.detach()),
                                 critic_2(states, actions.detach())).view(-1, 1)

            # Compute Q losses
            v_next = target_value_critic(batch.next_state()).view(-1, 1)
            q_old_pred1 = critic_1(states, old_actions.detach()).view(-1, 1)
            q_old_pred2 = critic_2(states, old_actions.detach()).view(-1, 1)
            qloss1 = ch.algorithms.sac.action_value_loss(q_old_pred1,
                                                         v_next.detach(),
                                                         rewards,
                                                         dones,
                                                         DISCOUNT)
            qloss2 = ch.algorithms.sac.action_value_loss(q_old_pred2,
                                                         v_next.detach(),
                                                         rewards,
                                                         dones,
                                                         DISCOUNT)

            # Update Q-functions by one step of gradient descent
            qloss = qloss1 + qloss2
            critics_optimiser.zero_grad()
            qloss.backward()
            critics_optimiser.step()

            # Update V-function by one step of gradient descent
            v_pred = value_critic(batch.state()).view(-1, 1)
            vloss = ch.algorithms.sac.state_value_loss(v_pred,
                                                       log_probs.detach(),
                                                       q_values.detach(),
                                                       alpha=ENTROPY_WEIGHT)
            value_critic_optimiser.zero_grad()
            vloss.backward()
            value_critic_optimiser.step()

            # Update policy by one step of gradient ascent
            q_actions = critic_1(batch.state(), actions).view(-1, 1)
            policy_loss = ch.algorithms.sac.policy_loss(log_probs,
                                                        q_actions,
                                                        alpha=ENTROPY_WEIGHT)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target value network
            ch.models.polyak_average(target_value_critic,
                                     value_critic,
                                     POLYAK_FACTOR)