def worker_rollout(ps, replay_buffer, opt):
    agent = Actor(opt, job='worker', buffer=replay_buffer)
    while True:
        weights = ray.get(ps.pull.remote())
        agent.set_weights(weights)
        agent.run()
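# All workers in this file talk to the parameter server only through
# pull/get_weights/save_weights handles. Below is a minimal sketch of a Ray
# actor with that interface, following the keyed pull() used by the later
# variants (the variant above pulls without keys). The storage layout and the
# push() method are assumptions for illustration, not the original class; the
# sketch reuses this script's existing imports (ray, pickle).
@ray.remote
class ParameterServer:
    def __init__(self, keys, values):
        # flat dict: variable name -> ndarray
        self.weights = dict(zip(keys, values))

    def push(self, keys, values):
        # the learner pushes freshly trained variables (assumed method)
        for key, value in zip(keys, values):
            self.weights[key] = value

    def pull(self, keys):
        # workers receive values in the same order as the requested keys,
        # which is what agent.set_weights(keys, weights) relies on
        return [self.weights[key] for key in keys]

    def get_weights(self):
        # the test worker saves this full dict to a pickle file
        return self.weights

    def save_weights(self):
        # checkpoint to disk; the restore snippet later in this file
        # reads a file of this form back
        with open("weights.pickle", "wb") as pickle_out:
            pickle.dump(self.weights, pickle_out)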
def worker_rollout(ps, replay_buffer, opt, worker_index):
    # env = gym.make(opt.env_name)
    env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                  opt.reward_scale, 3)

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # epochs = opt.total_epochs // opt.num_workers
    total_steps = opt.steps_per_epoch * opt.total_epochs

    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)

    # TODO opt.start_steps
    # for t in range(total_steps):
    t = 0
    while True:
        if t > opt.start_steps:
            a = agent.get_action(o)
        else:
            a = env.action_space.sample()
        t += 1

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == opt.max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store.remote(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == opt.max_ep_len):
            # a_l_ratio control: if actors have produced more than a_l_ratio
            # environment steps per learner sample, block this worker until
            # the learner catches up.
            sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote())
            while sample_times > 0 and steps / sample_times > opt.a_l_ratio:
                sample_times, steps, _ = ray.get(
                    replay_buffer.get_counts.remote())
                time.sleep(0.1)

            # update parameters every episode
            weights = ray.get(ps.pull.remote(keys))
            agent.set_weights(keys, weights)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()  # reseed from OS entropy so forked workers diverge

    ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(ROOT)
    from trading_env import TradingEnv, FrameStack

    # ------ env set up ------
    # env = gym.make(opt.env_name)
    env = TradingEnv(action_scheme_id=3, obs_dim=38)

    while True:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control: estimate progress from one random shard
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:
            # No need for random warm-up actions once the buffers hold enough
            # steps, or when weights were recovered from a local file.
            if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover:
                a = agent.get_action(o)
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            # scatter writes across buffer shards
            np.random.seed()
            rand_buff = np.random.choice(opt.num_buffers, 1)[0]
            replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index)

            o = o2

            # End of episode.
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d:
                break
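# Each element of replay_buffer above is a Ray actor, chosen at random per
# store() call to spread write load across opt.num_buffers shards. A minimal
# sketch of the interface these workers rely on follows; the sampling side
# and the internal storage are assumptions for illustration, not the original
# class (it reuses this script's existing imports: ray, numpy as np, deque).
@ray.remote
class ReplayBufferShard:
    def __init__(self, size):
        self.storage = deque([], maxlen=size)
        self.actor_steps = 0    # transitions written by rollout workers
        self.learner_steps = 0  # batches drawn by the learner

    def store(self, o, a, r, o2, d, worker_index):
        self.storage.append((o, a, r, o2, d))
        self.actor_steps += 1

    def sample_batch(self, batch_size):
        # hypothetical learner-side call; counted for a_l_ratio control
        self.learner_steps += 1
        idxs = np.random.randint(0, len(self.storage), size=batch_size)
        return [self.storage[i] for i in idxs]

    def get_counts(self):
        # tuple order matches the callers in this file:
        # (learner_steps, actor_steps, buffer_size)
        return self.learner_steps, self.actor_steps, len(self.storage)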
def worker_test(ps, node_buffer, opt):
    agent = Actor(opt, job="test", buffer=ReplayBuffer)

    init_time = time.time()
    save_times = 0
    checkpoint_times = 0

    while True:
        weights = ray.get(ps.get_weights.remote())
        agent.set_weights(weights)

        start_actor_step, start_learner_step, _ = get_al_status(node_buffer)
        start_time = time.time()

        agent.run()

        last_actor_step, last_learner_step, _ = get_al_status(node_buffer)
        actor_step = np.sum(last_actor_step) - np.sum(start_actor_step)
        learner_step = np.sum(last_learner_step) - np.sum(start_learner_step)
        alratio = actor_step / (learner_step + 1)
        update_frequency = int(learner_step / (time.time() - start_time))
        total_learner_step = np.sum(last_learner_step)

        print("---------------------------------------------------")
        print("frame freq:",
              np.round((last_actor_step - start_actor_step) /
                       (time.time() - start_time)))
        print("actor_steps:", np.sum(last_actor_step),
              "learner_step:", total_learner_step)
        print("actor learner ratio: %.2f" % alratio)
        print("learner freq:", update_frequency)
        print("Ray total resources:", ray.cluster_resources())
        print("available resources:", ray.available_resources())
        print("---------------------------------------------------")

        total_time = time.time() - init_time

        if total_learner_step // opt.save_interval > save_times:
            with open(opt.save_dir + "/" + str(total_learner_step / 1e6) +
                      "_weights.pickle", "wb") as pickle_out:
                pickle.dump(weights, pickle_out)
            print("****** Weights saved by time! ******")
            save_times = total_learner_step // opt.save_interval

        # save everything every checkpoint_freq seconds
        if total_time // opt.checkpoint_freq > checkpoint_times:
            print("save everything!")
            save_start_time = time.time()
            # node_ps and model_types come from the enclosing script's scope
            ps_save_op = [node_ps[i].save_weights.remote()
                          for i in range(opt.num_nodes)]
            buffer_save_op = [node_buffer[node_index][model_type].save.remote()
                              for model_type in model_types
                              for node_index in range(opt.num_nodes)]
            # 5 models + the parameter server per node
            ray.wait(buffer_save_op + ps_save_op,
                     num_returns=opt.num_nodes * 6)
            print("total time for saving:", time.time() - save_start_time)
            checkpoint_times = total_time // opt.checkpoint_freq
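# get_al_status() above aggregates get_counts() across every buffer shard on
# every node. The sketch below is consistent with how its results are used
# (per-shard arrays that get subtracted and summed, with actor steps first);
# the exact signature and node_buffer layout are assumptions, following the
# node_buffer[node_index][i] indexing used by the later test worker.
def get_al_status(node_buffer):
    # query all shards in parallel, then regroup the per-shard tuples
    counts = ray.get([node_buffer[node_index][i].get_counts.remote()
                      for node_index in range(len(node_buffer))
                      for i in range(len(node_buffer[node_index]))])
    learner_steps, actor_steps, sizes = map(np.array, zip(*counts))
    # callers unpack (actor, learner, size) and compute alratio = actor / learner
    return actor_steps, learner_steps, sizes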
def worker_test(ps, replay_buffer, opt):
    agent = Actor(opt, job="main")
    keys, weights = agent.get_weights()

    time0 = time1 = time.time()
    sample_times1, steps, size = ray.get(replay_buffer.get_counts.remote())
    max_ret = -1000

    env = gym.make(opt.env_name)

    while True:
        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        ep_ret = agent.test(env, replay_buffer)

        sample_times2, steps, size = ray.get(replay_buffer.get_counts.remote())
        time2 = time.time()
        print("test_reward:", ep_ret, "sample_times:", sample_times2,
              "steps:", steps, "buffer_size:", size)
        print('update frequency:',
              (sample_times2 - sample_times1) / (time2 - time1),
              'total time:', time2 - time0)

        if ep_ret > max_ret:
            ps.save_weights.remote()
            print("****** weights saved! ******")
            max_ret = ep_ret

        time1 = time2
        sample_times1 = sample_times2

        # if steps >= opt.total_epochs * opt.steps_per_epoch:
        #     exit(0)
        # if time2 - time0 > 30:
        #     exit(0)

        time.sleep(5)
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    filling_steps = 0

    while True:
        # ------ env set up ------
        env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                      opt.reward_scale, 3)
        # ------ env set up end ------

        ################################## deques
        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)
        ################################## deques

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        ################################## deques reset
        t_queue = 1
        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))
        ################################## deques reset

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        while True:
            # No need for random warm-up actions if weights were loaded
            # from a local file.
            if filling_steps > opt.start_steps or opt.weights_file:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                filling_steps += 1

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            #################################### deques store
            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO and t_queue % 2 == 0: %1 lead to q smaller
            # TODO if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
            replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(
                o_queue, a_r_d_queue, worker_index)
            t_queue += 1
            #################################### deques store

            # End of episode. Training (ep_len times).
            if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
                # TODO
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())
                print('rollout_ep_len:', ep_len * opt.action_repeat,
                      'rollout_ep_ret:', ep_ret)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                ################################## deques reset
                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))
                ################################## deques reset
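# Each store() call above ships an overlapping window: opt.Ln + 1 observations
# in o_queue and opt.Ln (action, reward, done) triples in a_r_d_queue, so
# element i of a_r_d_queue is the transition from o_queue[i] to o_queue[i + 1].
# The helper below is a sketch, not part of the original buffer code, showing
# how one such segment unpacks back into single-step transitions; unpack_fn is
# a hypothetical parameter for undoing pack() on cnn observations.
def unpack_segment(o_queue, a_r_d_queue, unpack_fn=None):
    """Yield (o, a, r, o2, d) tuples from one Ln-step segment."""
    obs = [entry[0] for entry in o_queue]  # deque entries are 1-tuples
    if unpack_fn is not None:
        obs = [unpack_fn(o) for o in obs]
    for i, (a, r, d) in enumerate(a_r_d_queue):
        yield obs[i], a, r, obs[i + 1], d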
flags.DEFINE_string(
    "is_restore", "False",
    "True or False. True means restore weights from pickle file.")
flags.DEFINE_float("a_l_ratio", 10, "steps / sample_times")

opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers,
                      FLAGS.a_l_ratio)
agent = Actor(opt, job="main")
keys, weights = agent.get_weights()

with open("weights.pickle", "rb") as pickle_in:
    weights = pickle.load(pickle_in)
weights = [weights[key] for key in keys]
agent.set_weights(keys, weights)

test_env = gym.make(opt.env_name)

n = 2
rew = []
for j in range(n):
    o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
    while not (d or (ep_len == opt.max_ep_len)):
        # Take deterministic actions at test time
        test_env.render()
        action = agent.get_action(o, True)
        print(action)
        o, r, d, _ = test_env.step(action)
        ep_ret += r
        ep_len += 1
    rew.append(ep_ret)
def worker_test(ps, node_buffer, opt):
    agent = Actor(opt, job="test")
    keys = agent.get_weights()[0]

    # test_env = gym.make(opt.env_name)
    ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(ROOT)
    from trading_env import TradingEnv, FrameStack
    test_env = TradingEnv(action_scheme_id=3, obs_dim=38)

    init_time = time.time()
    save_times = 0
    checkpoint_times = 0

    while True:
        # keep the full weights dict so it can be saved to disk below
        weights_all = ray.get(ps.get_weights.remote())
        weights = [weights_all[key] for key in keys]
        agent.set_weights(keys, weights)

        start_actor_step, start_learner_step, _ = get_al_status(node_buffer)
        start_time = time.time()

        ave_test_reward, ave_score = agent.test(test_env, 10)

        last_actor_step, last_learner_step, _ = get_al_status(node_buffer)
        actor_step = np.sum(last_actor_step) - np.sum(start_actor_step)
        learner_step = np.sum(last_learner_step) - np.sum(start_learner_step)
        alratio = actor_step / (learner_step + 1)
        update_frequency = int(learner_step / (time.time() - start_time))
        total_learner_step = np.sum(last_learner_step)

        print("---------------------------------------------------")
        print("average test reward:", ave_test_reward)
        print("average test score:", ave_score)
        print("frame freq:",
              np.round((last_actor_step - start_actor_step) /
                       (time.time() - start_time)))
        print("actor_steps:", np.sum(last_actor_step),
              "learner_step:", total_learner_step)
        print("actor learner ratio: %.2f" % alratio)
        print("learner freq:", update_frequency)
        print("Ray total resources:", ray.cluster_resources())
        print("available resources:", ray.available_resources())
        print("---------------------------------------------------")

        if learner_step < 100:
            alratio = 0
        agent.write_tb(ave_test_reward, ave_score, alratio, update_frequency,
                       total_learner_step)

        total_time = time.time() - init_time

        if total_learner_step // opt.save_interval > save_times:
            with open(opt.save_dir + "/" + str(total_learner_step / 1e6) +
                      "M_" + str(ave_test_reward) + "_weights.pickle",
                      "wb") as pickle_out:
                pickle.dump(weights_all, pickle_out)
            print("****** Weights saved by time! ******")
            save_times = total_learner_step // opt.save_interval

        # save everything every checkpoint_freq seconds
        if total_time // opt.checkpoint_freq > checkpoint_times:
            print("save everything!")
            save_start_time = time.time()
            # node_ps comes from the enclosing script's scope
            ps_save_op = [
                node_ps[i].save_weights.remote() for i in range(opt.num_nodes)
            ]
            buffer_save_op = [
                node_buffer[node_index][i].save.remote()
                for i in range(opt.num_buffers)
                for node_index in range(opt.num_nodes)
            ]
            ray.wait(buffer_save_op + ps_save_op,
                     num_returns=opt.num_nodes * opt.num_buffers + 1)
            print("total time for saving:", time.time() - save_start_time)
            checkpoint_times = total_time // opt.checkpoint_freq
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    np.random.seed()  # reseed from OS entropy so forked workers diverge
    rand_buff1 = np.random.choice(opt.num_buffers, 1)[0]
    random_steps = 0

    while True:
        # ------ env set up ------
        env = TradingEnv()
        # env = Wrapper(env, opt.action_repeat, opt.reward_scale)
        # ------ env set up end ------

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        ep_score, ep_target_bias = 0, 0

        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))
        t_queue = 1

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control: snapshot one shard's counters at episode start
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:
            # No need for random warm-up actions if weights were loaded from
            # a local file or training is recovering from a checkpoint.
            if random_steps > opt.start_steps or opt.weights_file or opt.recover:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                random_steps += 1

            # Step the env
            o2, r, d, info = env.step(a)
            ep_ret += r
            ep_score += info['score']
            ep_target_bias += info['target_bias']
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO and t_queue % 2 == 0: %1 lead to q smaller
            # TODO if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
            replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(
                o_queue, a_r_d_queue, worker_index)
            t_queue += 1

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d or ep_len > opt.max_ep_len:
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())
                # print('rollout ep_len:', ep_len * opt.action_repeat,
                #       'ep_score:', ep_score, 'ep_target_bias:', ep_target_bias)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))

                # a_l_ratio control: block this worker while actors produce
                # data more than a_l_ratio times faster than the learner
                # consumes it
                learner_steps, actor_steps, _size = ray.get(
                    replay_buffer[rand_buff].get_counts.remote())
                while (actor_steps - last_actor_steps) / (
                        learner_steps - last_learner_steps + 1) > opt.a_l_ratio \
                        and last_learner_steps > 0:
                    time.sleep(1)
                    learner_steps, actor_steps, _size = ray.get(
                        replay_buffer[rand_buff].get_counts.remote())