Example #1
def worker_rollout(ps, replay_buffer, opt, worker_index):

    # env = gym.make(opt.env_name)

    env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                  opt.reward_scale, 3)

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # epochs = opt.total_epochs // opt.num_workers
    total_steps = opt.steps_per_epoch * opt.total_epochs

    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)

    # TODO opt.start_steps
    # for t in range(total_steps):
    t = 0
    while True:
        if t > opt.start_steps:
            a = agent.get_action(o)
        else:
            a = env.action_space.sample()
        t += 1
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == opt.max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store.remote(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == opt.max_ep_len):
            sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote())

            while sample_times > 0 and steps / sample_times > opt.a_l_ratio:
                sample_times, steps, _ = ray.get(
                    replay_buffer.get_counts.remote())
                time.sleep(0.1)

            # update parameters every episode
            weights = ray.get(ps.pull.remote(keys))
            agent.set_weights(keys, weights)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
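The parameter server ps and the replay buffer this worker talks to are Ray actors defined elsewhere in the repository. A minimal sketch of those two interfaces, reconstructed only from the call sites above (ps.pull.remote(keys), replay_buffer.store.remote(...), replay_buffer.get_counts.remote()) and therefore an assumption rather than the original classes:

import numpy as np
import ray


@ray.remote
class ParameterServer:
    def __init__(self, keys, values):
        # weights kept as a dict: variable name -> numpy array
        self.weights = dict(zip(keys, values))

    def pull(self, keys):
        # return weights in the same order the worker requested them
        return [self.weights[k] for k in keys]

    def push(self, keys, values):
        # the learner would overwrite the stored weights after each update
        for k, v in zip(keys, values):
            self.weights[k] = v


@ray.remote
class ReplayBuffer:
    def __init__(self, size):
        self.storage, self.size = [], size
        self.actor_steps, self.learner_steps = 0, 0

    def store(self, o, a, r, o2, d):
        if len(self.storage) >= self.size:
            self.storage.pop(0)
        self.storage.append((o, a, r, o2, d))
        self.actor_steps += 1

    def sample_batch(self, batch_size):
        idx = np.random.randint(len(self.storage), size=batch_size)
        self.learner_steps += 1
        return [self.storage[i] for i in idx]

    def get_counts(self):
        # (learner sample count, actor step count, current size), matching the
        # `sample_times, steps, _` unpacking in the worker above
        return self.learner_steps, self.actor_steps, len(self.storage)


# Typical wiring (hypothetical): the driver creates both actors and then
# launches the rollout workers as Ray tasks, e.g.
#   workers = [ray.remote(worker_rollout).remote(ps, buffer, opt, i)
#              for i in range(opt.num_workers)]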
Example #2
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()

    ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(ROOT)
    from trading_env import TradingEnv, FrameStack
    # ------ env set up ------
    # env = gym.make(opt.env_name)
    env = TradingEnv(action_scheme_id=3, obs_dim=38)

    while True:

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:

            # no need for random warm-up actions if weights were loaded from a local checkpoint.
            if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover:
                a = agent.get_action(o)
            else:
                a = env.action_space.sample()
            # Step the env
            o2, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            np.random.seed()
            rand_buff = np.random.choice(opt.num_buffers, 1)[0]
            replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index)

            o = o2

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d:
                break
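This variant shards experience across opt.num_buffers replay-buffer actors and picks one uniformly at random for every store call, so no single actor becomes a bottleneck. How that list is created is not shown; a minimal sketch under that assumption (the buffer_size argument and ReplayBuffer class are illustrative, only opt.num_buffers comes from the example):

import ray

def make_buffers(opt, buffer_cls):
    # one Ray actor per shard; workers then call
    # replay_buffer[rand_buff].store.remote(...) on a random shard, as above
    return [buffer_cls.remote(opt.buffer_size) for _ in range(opt.num_buffers)]

# Note: np.random.seed() with no argument reseeds NumPy from OS entropy, so each
# worker process draws its own sequence of rand_buff indices instead of a
# sequence shared with other workers that started from the same RNG state.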
Example #3
def worker_rollout(ps, replay_buffer, opt, worker_index):

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    filling_steps = 0
    while True:
        # ------ env set up ------
        env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                      opt.reward_scale, 3)
        # ------ env set up end ------

        ################################## deques

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        ################################## deques

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        ################################## deques reset
        t_queue = 1
        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o, ))
        else:
            o_queue.append((o, ))

        ################################## deques reset

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        while True:

            # no need for random warm-up actions if weights were loaded from a local checkpoint.
            if filling_steps > opt.start_steps or opt.weights_file:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                filling_steps += 1
            # Step the env
            o2, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            #################################### deques store

            a_r_d_queue.append((
                a,
                r,
                d,
            ))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2, ))
            else:
                o_queue.append((o2, ))

            # scheme 1:
            # TODO  and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers,
                                               1)[0]].store.remote(
                                                   o_queue, a_r_d_queue,
                                                   worker_index)

            t_queue += 1

            #################################### deques store

            # End of episode. Training (ep_len times).
            if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
                # TODO
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())

                print('rollout_ep_len:', ep_len * opt.action_repeat,
                      'rollout_ep_ret:', ep_ret)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                ################################## deques reset
                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o, ))
                else:
                    o_queue.append((o, ))
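The two deques above package an Ln-step segment: o_queue carries Ln + 1 consecutive observations and a_r_d_queue the Ln (a, r, d) tuples between them, and whole segments are pushed to a random buffer every save_freq steps. A self-contained sketch of how a learner could turn such a segment into an n-step bootstrap target (the discount gamma and the critic q_fn are illustrative placeholders, not part of the example):

def n_step_target(o_queue, a_r_d_queue, q_fn, gamma=0.99):
    # o_queue holds observations o_t ... o_{t+Ln} as 1-tuples, as stored above;
    # a_r_d_queue holds the Ln transitions (a_i, r_i, d_i) in between.
    # (For the cnn model the stored entry is compressed with pack() and would
    # need unpacking first.)
    target, discount = 0.0, 1.0
    for _a, r, d in a_r_d_queue:
        target += discount * r
        discount *= gamma
        if d:                      # stop bootstrapping at a real terminal state
            return target
    return target + discount * q_fn(o_queue[-1][0])

# e.g. with a dummy critic: n_step_target(o_queue, a_r_d_queue, q_fn=lambda o: 0.0)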
Example #4
opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers,
                      FLAGS.a_l_ratio)

agent = Actor(opt, job="main")
keys, weights = agent.get_weights()
with open("weights.pickle", "rb") as pickle_in:
    weights = pickle.load(pickle_in)

weights = [weights[key] for key in keys]

agent.set_weights(keys, weights)

test_env = gym.make(opt.env_name)

n = 2

rew = []
for j in range(n):
    o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
    while not (d or (ep_len == opt.max_ep_len)):
        # Take deterministic actions at test time
        test_env.render()
        action = agent.get_action(o, True)
        print(action)
        o, r, d, _ = test_env.step(action)
        ep_ret += r
        ep_len += 1
    rew.append(ep_ret)
print("test_reward:", sum(rew) / n)
Example #5
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()
    rand_buff1 = np.random.choice(opt.num_buffers, 1)[0]

    random_steps = 0

    while True:
        # ------ env set up ------

        env = TradingEnv()
        # env = Wrapper(env, opt.action_repeat, opt.reward_scale)
        # ------ env set up end ------

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        ep_score, ep_target_bias = 0, 0

        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))

        t_queue = 1

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())

        while True:

            # no need for random warm-up actions if weights were loaded from a local checkpoint.
            if random_steps > opt.start_steps or opt.weights_file or opt.recover:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                random_steps += 1
            # Step the env
            o2, r, d, info = env.step(a)

            ep_ret += r
            ep_score += info['score']
            ep_target_bias += info['target_bias']
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO  and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(o_queue, a_r_d_queue, worker_index)

            t_queue += 1

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d or ep_len > opt.max_ep_len:
                sample_times, steps, _ = ray.get(replay_buffer[0].get_counts.remote())

                # print('rollout ep_len:', ep_len * opt.action_repeat, 'ep_score:', ep_score,
                #       'ep_target_bias:', ep_target_bias)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                # also reset the per-episode statistics accumulated above
                ep_score, ep_target_bias = 0, 0

                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))

                # for a_l_ratio control
                learner_steps, actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())

                while (last_learner_steps > 0
                       and (actor_steps - last_actor_steps)
                       / (learner_steps - last_learner_steps + 1) > opt.a_l_ratio):
                    time.sleep(1)
                    learner_steps, actor_steps, _size = ray.get(
                        replay_buffer[rand_buff].get_counts.remote())
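The closing loop is the actor-to-learner ratio throttle: the worker sleeps whenever it has produced more than a_l_ratio environment steps per learner update since the start of the episode. The same logic as a standalone helper, shown only as a sketch; poll_fn is an illustrative stand-in for ray.get(replay_buffer[rand_buff].get_counts.remote()):

import time

def wait_for_learner(poll_fn, last_learner_steps, last_actor_steps,
                     a_l_ratio, poll_interval=1.0):
    learner_steps, actor_steps, _size = poll_fn()
    while (last_learner_steps > 0
           and (actor_steps - last_actor_steps)
           / (learner_steps - last_learner_steps + 1) > a_l_ratio):
        time.sleep(poll_interval)  # pause acting until the learner catches up
        learner_steps, actor_steps, _size = poll_fn()
    return learner_steps, actor_steps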