def worker_rollout(ps, replay_buffer, opt, worker_index):
    # env = gym.make(opt.env_name)
    env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                  opt.reward_scale, 3)

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # epochs = opt.total_epochs // opt.num_workers
    total_steps = opt.steps_per_epoch * opt.total_epochs

    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)

    # TODO opt.start_steps
    # for t in range(total_steps):
    t = 0
    while True:
        if t > opt.start_steps:
            a = agent.get_action(o)
        else:
            a = env.action_space.sample()
        t += 1

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == opt.max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store.remote(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == opt.max_ep_len):
            sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote())
            while sample_times > 0 and steps / sample_times > opt.a_l_ratio:
                sample_times, steps, _ = ray.get(
                    replay_buffer.get_counts.remote())
                time.sleep(0.1)

            # update parameters every episode
            weights = ray.get(ps.pull.remote(keys))
            agent.set_weights(keys, weights)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
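# ---------------------------------------------------------------------------
# The rollout above only assumes that `ps` and `replay_buffer` are Ray actor
# handles exposing pull/push and store/get_counts respectively; their real
# implementations live elsewhere in this repo. The sketch below is a
# hypothetical, minimal stand-in (field names and counting logic are
# assumptions, not the repo's actual classes) that is just enough to drive
# worker_rollout in isolation.
# ---------------------------------------------------------------------------
import ray


@ray.remote
class ReplayBuffer:
    """Minimal buffer sketch: stores transitions and tracks two counters."""

    def __init__(self):
        self.storage = []
        self.sample_times = 0  # incremented by the learner each time it samples
        self.steps = 0         # environment steps stored by workers

    def store(self, o, a, r, o2, d):
        self.storage.append((o, a, r, o2, d))
        self.steps += 1

    def get_counts(self):
        # matches the (sample_times, steps, size) tuple unpacked in worker_rollout
        return self.sample_times, self.steps, len(self.storage)


@ray.remote
class ParameterServer:
    """Minimal parameter-server sketch: weights keyed by variable name."""

    def __init__(self, keys, values):
        self.weights = dict(zip(keys, values))

    def pull(self, keys):
        return [self.weights[key] for key in keys]

    def push(self, keys, values):
        self.weights.update(dict(zip(keys, values)))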
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    np.random.seed()
    ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(ROOT)
    from trading_env import TradingEnv, FrameStack

    # ------ env set up ------
    # env = gym.make(opt.env_name)
    env = TradingEnv(action_scheme_id=3, obs_dim=38)

    while True:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:
            # No need to take random warm-up actions if weights were loaded
            # from a local file.
            if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover:
                a = agent.get_action(o)
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            np.random.seed()
            rand_buff = np.random.choice(opt.num_buffers, 1)[0]
            replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index)

            o = o2

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d:
                break
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    filling_steps = 0

    while True:
        # ------ env set up ------
        env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                      opt.reward_scale, 3)
        # ------ env set up end ------

        ################################## deques
        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)
        ################################## deques

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        ################################## deques reset
        t_queue = 1
        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))
        ################################## deques reset

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        while True:
            # No need to take random warm-up actions if weights were loaded
            # from a local file.
            if filling_steps > opt.start_steps or opt.weights_file:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                filling_steps += 1

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            #################################### deques store
            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(
                    o_queue, a_r_d_queue, worker_index)

            t_queue += 1
            #################################### deques store

            # End of episode. Training (ep_len times).
            if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
                # TODO
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())
                print('rollout_ep_len:', ep_len * opt.action_repeat,
                      'rollout_ep_ret:', ep_ret)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                ################################## deques reset
                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))
                ################################## deques reset
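# ---------------------------------------------------------------------------
# This variant indexes `replay_buffer` as a list of sharded buffer actors and
# pulls weights from a single parameter-server actor. Below is a hypothetical
# launch sketch under those assumptions: HyperParameters, FLAGS, Actor and
# worker_rollout come from this repo, while ParameterServer / ReplayBuffer
# stand for the repo's actor classes (or the minimal sketch shown earlier);
# their constructor arguments here are assumptions, not the real signatures.
# ---------------------------------------------------------------------------
import ray

if __name__ == "__main__":
    ray.init()

    opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers,
                          FLAGS.a_l_ratio)

    # weights of a freshly built Actor seed the parameter server
    agent = Actor(opt, job="main")
    keys, values = agent.get_weights()

    ps = ParameterServer.remote(keys, values)
    replay_buffer = [ReplayBuffer.remote() for _ in range(opt.num_buffers)]

    # each worker runs its rollout loop as a long-lived Ray task
    rollout = ray.remote(num_cpus=1)(worker_rollout)
    tasks = [rollout.remote(ps, replay_buffer, opt, i)
             for i in range(opt.num_workers)]
    ray.wait(tasks)  # workers loop forever; this blocks alongside the learner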
opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers,
                      FLAGS.a_l_ratio)

agent = Actor(opt, job="main")
keys, weights = agent.get_weights()

with open("weights.pickle", "rb") as pickle_in:
    weights = pickle.load(pickle_in)
weights = [weights[key] for key in keys]

agent.set_weights(keys, weights)

test_env = gym.make(opt.env_name)
n = 2
rew = []
for j in range(n):
    o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
    while not (d or (ep_len == opt.max_ep_len)):
        # Take deterministic actions at test time
        test_env.render()
        action = agent.get_action(o, True)
        print(action)
        o, r, d, _ = test_env.step(action)
        ep_ret += r
        ep_len += 1
    rew.append(ep_ret)
print("test_reward:", sum(rew) / n)
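# ---------------------------------------------------------------------------
# The test script above expects "weights.pickle" to hold a {key: weight} dict
# matching agent.get_weights(). A minimal sketch of producing that file from
# a trained agent is shown below; the filename and dict layout mirror the
# loading code, but where exactly this would be called during training is an
# assumption.
# ---------------------------------------------------------------------------
import pickle


def save_weights(agent, path="weights.pickle"):
    keys, weights = agent.get_weights()
    with open(path, "wb") as pickle_out:
        # store as {variable_name: value} so the test script can reorder by `keys`
        pickle.dump(dict(zip(keys, weights)), pickle_out)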
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    np.random.seed()
    rand_buff1 = np.random.choice(opt.num_buffers, 1)[0]
    random_steps = 0

    while True:
        # ------ env set up ------
        env = TradingEnv()
        # env = Wrapper(env, opt.action_repeat, opt.reward_scale)
        # ------ env set up end ------

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        ep_score, ep_target_bias = 0, 0

        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))
        t_queue = 1

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:
            # No need to take random warm-up actions if weights were loaded
            # from a local file.
            if random_steps > opt.start_steps or opt.weights_file or opt.recover:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                random_steps += 1

            # Step the env
            o2, r, d, info = env.step(a)
            ep_ret += r
            ep_score += info['score']
            ep_target_bias += info['target_bias']
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(
                    o_queue, a_r_d_queue, worker_index)

            t_queue += 1

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d or ep_len > opt.max_ep_len:
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())
                # print('rollout ep_len:', ep_len * opt.action_repeat, 'ep_score:', ep_score,
                #       'ep_target_bias:', ep_target_bias)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))

                # for a_l_ratio control
                learner_steps, actor_steps, _size = ray.get(
                    replay_buffer[rand_buff].get_counts.remote())
                while ((actor_steps - last_actor_steps) /
                       (learner_steps - last_learner_steps + 1) > opt.a_l_ratio
                       and last_learner_steps > 0):
                    time.sleep(1)
                    learner_steps, actor_steps, _size = ray.get(
                        replay_buffer[rand_buff].get_counts.remote())
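# ---------------------------------------------------------------------------
# The inline wait loop at the end of the episode above is the actor/learner
# ratio throttle: a worker pauses whenever its environment steps have outpaced
# learner updates by more than opt.a_l_ratio since the last recorded counts.
# The helper below is a hypothetical refactoring of that same logic, shown
# only to make the mechanism explicit; it is not part of the repo.
# ---------------------------------------------------------------------------
import time

import ray


def wait_for_learner(buffer, last_learner_steps, last_actor_steps, a_l_ratio, poll_s=1):
    """Block until new actor steps per new learner step drop below a_l_ratio."""
    learner_steps, actor_steps, _size = ray.get(buffer.get_counts.remote())
    while (last_learner_steps > 0 and
           (actor_steps - last_actor_steps) /
           (learner_steps - last_learner_steps + 1) > a_l_ratio):
        time.sleep(poll_s)
        learner_steps, actor_steps, _size = ray.get(buffer.get_counts.remote())
    return learner_steps, actor_steps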