Example #1
def worker_test(ps, replay_buffer, opt):
    agent = Actor(opt, job="main")

    test_env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                       opt.reward_scale, 3)

    agent.test(ps, replay_buffer, opt, test_env)
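Examples #1, #2 and #7 build the environment through a `Wrapper` class whose definition is not shown. A minimal sketch of what such a wrapper could look like, assuming the positional arguments are observation noise, action noise, reward scale and an action repeat of 3 (the trailing `3`), is given below; the internals are guesses, not the original implementation.

import gym
import numpy as np


class Wrapper(gym.Wrapper):
    """Hypothetical env wrapper: Gaussian observation/action noise,
    reward scaling and action repeat, matching the call sites above."""

    def __init__(self, env, obs_noise, act_noise, reward_scale, action_repeat=3):
        super().__init__(env)
        self.obs_noise = obs_noise
        self.act_noise = act_noise
        self.reward_scale = reward_scale
        self.action_repeat = action_repeat

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        return obs + self.obs_noise * np.random.randn(*np.shape(obs))

    def step(self, action):
        action = action + self.act_noise * np.random.randn(*np.shape(action))
        total_reward, obs, done, info = 0.0, None, False, {}
        for _ in range(self.action_repeat):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        obs = obs + self.obs_noise * np.random.randn(*np.shape(obs))
        return obs, self.reward_scale * total_reward, done, info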
Example #2
def worker_rollout(ps, replay_buffer, opt, worker_index):

    # env = gym.make(opt.env_name)

    env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                  opt.reward_scale, 3)

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # epochs = opt.total_epochs // opt.num_workers
    total_steps = opt.steps_per_epoch * opt.total_epochs

    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)

    # TODO opt.start_steps
    # for t in range(total_steps):
    t = 0
    while True:
        if t > opt.start_steps:
            a = agent.get_action(o)
        else:
            a = env.action_space.sample()
        t += 1
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == opt.max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store.remote(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == opt.max_ep_len):
            sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote())

            while sample_times > 0 and steps / sample_times > opt.a_l_ratio:
                sample_times, steps, _ = ray.get(
                    replay_buffer.get_counts.remote())
                time.sleep(0.1)

            # update parameters every episode
            weights = ray.get(ps.pull.remote(keys))
            agent.set_weights(keys, weights)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
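The rollout worker above talks to two Ray actors that are not part of the snippet: a parameter server reached through `ps.pull.remote(keys)` and a replay buffer reached through `replay_buffer.store.remote(...)` and `replay_buffer.get_counts.remote()`. A minimal sketch of such actors, with hypothetical internals but the same method names and the `(sample_times, steps, size)` count ordering the workers unpack, might look like this:

import pickle

import numpy as np
import ray


@ray.remote
class ParameterServer:
    """Holds the latest network weights, keyed by variable name."""

    def __init__(self, keys, values):
        self.weights = dict(zip(keys, values))

    def push(self, keys, values):
        # The learner publishes updated weights here.
        for key, value in zip(keys, values):
            self.weights[key] = value

    def pull(self, keys):
        # Workers fetch the current weights for the given keys.
        return [self.weights[key] for key in keys]

    def get_weights(self):
        return self.weights

    def save_weights(self):
        with open("weights.pickle", "wb") as f:
            pickle.dump(self.weights, f)


@ray.remote
class ReplayBuffer:
    """FIFO experience buffer with the (sample_times, steps, size)
    counters that the workers unpack from get_counts."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size
        self.steps, self.sample_times = 0, 0

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
        self.steps += 1

    def sample_batch(self, batch_size=128):
        self.sample_times += 1
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs], obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs], rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])

    def get_counts(self):
        return self.sample_times, self.steps, self.size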
Example #3
def worker_rollout(ps, replay_buffer, opt):
    agent = Actor(opt, job='worker', buffer=replay_buffer)
    
    while True:
        weights = ray.get(ps.pull.remote())
        agent.set_weights(weights)
        agent.run()
Example #4
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()

    ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(ROOT)
    from trading_env import TradingEnv, FrameStack
    # ------ env set up ------
    # env = gym.make(opt.env_name)
    env = TradingEnv(action_scheme_id=3, obs_dim=38)

    while True:

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:

            # No need to sample a random action if weights are loaded from a local file.
            if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover:
                a = agent.get_action(o)
            else:
                a = env.action_space.sample()
            # Step the env
            o2, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            np.random.seed()
            rand_buff = np.random.choice(opt.num_buffers, 1)[0]
            replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index)

            o = o2

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d:
                break
Example #5
def worker_test(ps, node_buffer, opt):
    agent = Actor(opt, job="test", buffer=ReplayBuffer)
    init_time = time.time()
    save_times = 0
    checkpoint_times = 0

    while True:
        weights = ray.get(ps.get_weights.remote())
        agent.set_weights(weights)
        start_actor_step, start_learner_step, _ = get_al_status(node_buffer)
        start_time = time.time()

        agent.run()

        last_actor_step, last_learner_step, _ = get_al_status(node_buffer)
        actor_step = np.sum(last_actor_step) - np.sum(start_actor_step)
        learner_step = np.sum(last_learner_step) - np.sum(start_learner_step)
        alratio = actor_step / (learner_step + 1)
        update_frequency = int(learner_step / (time.time() - start_time))
        total_learner_step = np.sum(last_learner_step)

        print("---------------------------------------------------")
        print("frame freq:", np.round((last_actor_step - start_actor_step) / (time.time() - start_time)))
        print("actor_steps:", np.sum(last_actor_step), "learner_step:", total_learner_step)
        print("actor leaner ratio: %.2f" % alratio)
        print("learner freq:", update_frequency)
        print("Ray total resources:", ray.cluster_resources())
        print("available resources:", ray.available_resources())
        print("---------------------------------------------------")

        total_time = time.time() - init_time

        if total_learner_step // opt.save_interval > save_times:
            with open(opt.save_dir + "/" + str(total_learner_step / 1e6) + "_weights.pickle", "wb") as pickle_out:
                pickle.dump(weights, pickle_out)
                print("****** Weights saved by time! ******")
            save_times = total_learner_step // opt.save_interval

        # save everything every checkpoint_freq s
        if total_time // opt.checkpoint_freq > checkpoint_times:
            print("save everything!")
            save_start_time = time.time()

            # node_ps and model_types are not defined in this snippet; they are
            # assumed to exist at module scope in the original code.
            ps_save_op = [node_ps[i].save_weights.remote() for i in range(opt.num_nodes)]
            buffer_save_op = [node_buffer[node_index][model_type].save.remote() for model_type in model_types for node_index in range(opt.num_nodes)]
            ray.wait(buffer_save_op + ps_save_op, num_returns=opt.num_nodes * 6)  # 5 model buffers + ps per node

            print("total time for saving :", time.time() - save_start_time)
            checkpoint_times = total_time // opt.checkpoint_freq
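Examples #5 and #9 aggregate actor/learner progress with a `get_al_status(node_buffer)` helper that is not shown. A rough sketch, assuming `node_buffer` is a list of per-node lists of buffer actors whose `get_counts` returns `(learner_steps, actor_steps, size)`, could be:

import numpy as np
import ray


def get_al_status(node_buffer):
    # Hypothetical helper: sum the counters of every buffer on each node and
    # return per-node arrays in the (actor_steps, learner_steps, sizes) order
    # that the test workers unpack.
    actor_steps, learner_steps, sizes = [], [], []
    for buffers in node_buffer:
        counts = ray.get([buf.get_counts.remote() for buf in buffers])
        learner_steps.append(sum(c[0] for c in counts))
        actor_steps.append(sum(c[1] for c in counts))
        sizes.append(sum(c[2] for c in counts))
    return np.array(actor_steps), np.array(learner_steps), np.array(sizes)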
Example #6
def worker_test(ps, replay_buffer, opt):

    agent = Actor(opt, job="main")

    keys, weights = agent.get_weights()

    time0 = time1 = time.time()
    sample_times1, steps, size = ray.get(replay_buffer.get_counts.remote())
    max_ret = -1000

    env = gym.make(opt.env_name)

    while True:
        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        ep_ret = agent.test(env, replay_buffer)
        sample_times2, steps, size = ray.get(replay_buffer.get_counts.remote())
        time2 = time.time()
        print("test_reward:", ep_ret, "sample_times:", sample_times2, "steps:",
              steps, "buffer_size:", size)
        print('update frequency:',
              (sample_times2 - sample_times1) / (time2 - time1), 'total time:',
              time2 - time0)

        if ep_ret > max_ret:
            ps.save_weights.remote()
            print("****** weights saved! ******")
            max_ret = ep_ret

        time1 = time2
        sample_times1 = sample_times2

        # if steps >= opt.total_epochs * opt.steps_per_epoch:
        #     exit(0)
        # if time2 - time0 > 30:
        #     exit(0)

        time.sleep(5)
Example #7
def worker_rollout(ps, replay_buffer, opt, worker_index):

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    filling_steps = 0
    while True:
        # ------ env set up ------
        env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                      opt.reward_scale, 3)
        # ------ env set up end ------

        ################################## deques

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        ################################## deques

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        ################################## deques reset
        t_queue = 1
        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o, ))
        else:
            o_queue.append((o, ))

        ################################## deques reset

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        while True:

            # No need to sample a random action if weights are loaded from a local file.
            if filling_steps > opt.start_steps or opt.weights_file:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                filling_steps += 1
            # Step the env
            o2, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            #################################### deques store

            a_r_d_queue.append((
                a,
                r,
                d,
            ))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2, ))
            else:
                o_queue.append((o2, ))

            # scheme 1:
            # TODO  and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers,
                                               1)[0]].store.remote(
                                                   o_queue, a_r_d_queue,
                                                   worker_index)

            t_queue += 1

            #################################### deques store

            # End of episode. Training (ep_len times).
            if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
                # TODO
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())

                print('rollout_ep_len:', ep_len * opt.action_repeat,
                      'rollout_ep_ret:', ep_ret)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                ################################## deques reset
                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o, ))
                else:
                    o_queue.append((o, ))
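Unlike Example #2, this worker pushes whole Ln-step segments (`o_queue`, `a_r_d_queue`) into one of several buffer actors. A minimal sketch of a segment buffer with that `store` signature (hypothetical internals, same `get_counts` ordering) might be:

from collections import deque

import numpy as np
import ray


@ray.remote
class NStepReplayBuffer:
    """Stores fixed-length (Ln-step) segments pushed by rollout workers."""

    def __init__(self, max_segments=100000):
        self.segments = deque([], maxlen=max_segments)
        self.steps = 0          # environment steps received
        self.sample_times = 0   # learner sample calls served

    def store(self, o_queue, a_r_d_queue, worker_index):
        # o_queue holds Ln + 1 observations, a_r_d_queue holds Ln
        # (action, reward, done) tuples; keep them together as one segment.
        self.segments.append((list(o_queue), list(a_r_d_queue), worker_index))
        self.steps += len(a_r_d_queue)

    def sample(self):
        self.sample_times += 1
        idx = np.random.randint(len(self.segments))
        return self.segments[idx]

    def get_counts(self):
        return self.sample_times, self.steps, len(self.segments)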
Example #8
flags = tf.app.flags
FLAGS = flags.FLAGS

# "Pendulum-v0" 'BipedalWalker-v2' 'LunarLanderContinuous-v2'
flags.DEFINE_string("env_name", "BipedalWalkerHardcore-v2", "game env")
flags.DEFINE_integer("total_epochs", 500, "total_epochs")
flags.DEFINE_integer("num_workers", 1, "number of workers")
flags.DEFINE_integer("num_learners", 1, "number of learners")
flags.DEFINE_string(
    "is_restore", "False",
    "True or False. True means restore weights from pickle file.")
flags.DEFINE_float("a_l_ratio", 10, "steps / sample_times")

opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers,
                      FLAGS.a_l_ratio)

agent = Actor(opt, job="main")
keys, weights = agent.get_weights()
with open("weights.pickle", "rb") as pickle_in:
    weights = pickle.load(pickle_in)

weights = [weights[key] for key in keys]

agent.set_weights(keys, weights)

test_env = gym.make(opt.env_name)

n = 2

rew = []
for j in range(n):
    o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
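    # --- The original snippet is cut off here. The lines below are a
    # --- hypothetical continuation of the evaluation loop, assuming
    # --- agent.get_action accepts a `deterministic` flag as in Example #7.
    while not (d or (ep_len == opt.max_ep_len)):
        a = agent.get_action(o, deterministic=True)
        o, r, d, _ = test_env.step(a)
        ep_ret += r
        ep_len += 1
    print("test episode return:", ep_ret, "length:", ep_len)
    rew.append(ep_ret)

print("mean test return over", n, "episodes:", sum(rew) / n)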
Example #9
def worker_test(ps, node_buffer, opt):

    agent = Actor(opt, job="test")
    keys = agent.get_weights()[0]

    # test_env = gym.make(opt.env_name)
    ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(ROOT)
    from trading_env import TradingEnv, FrameStack
    test_env = TradingEnv(action_scheme_id=3, obs_dim=38)

    init_time = time.time()
    save_times = 0
    checkpoint_times = 0

    while True:
        # weights_all for save it to local
        weights_all = ray.get(ps.get_weights.remote())
        weights = [weights_all[key] for key in keys]
        agent.set_weights(keys, weights)

        start_actor_step, start_learner_step, _ = get_al_status(node_buffer)
        start_time = time.time()

        ave_test_reward, ave_score = agent.test(test_env, 10)

        last_actor_step, last_learner_step, _ = get_al_status(node_buffer)
        actor_step = np.sum(last_actor_step) - np.sum(start_actor_step)
        learner_step = np.sum(last_learner_step) - np.sum(start_learner_step)
        alratio = actor_step / (learner_step + 1)
        update_frequency = int(learner_step / (time.time() - start_time))
        total_learner_step = np.sum(last_learner_step)

        print("---------------------------------------------------")
        print("average test reward:", ave_test_reward)
        print("average test score:", ave_score)
        print(
            "frame freq:",
            np.round((last_actor_step - start_actor_step) /
                     (time.time() - start_time)))
        print("actor_steps:", np.sum(last_actor_step), "learner_step:",
              total_learner_step)
        print("actor leaner ratio: %.2f" % alratio)
        print("learner freq:", update_frequency)
        print("Ray total resources:", ray.cluster_resources())
        print("available resources:", ray.available_resources())
        print("---------------------------------------------------")
        if learner_step < 100:
            alratio = 0
        agent.write_tb(ave_test_reward, ave_score, alratio, update_frequency,
                       total_learner_step)

        total_time = time.time() - init_time

        if total_learner_step // opt.save_interval > save_times:
            with open(
                    opt.save_dir + "/" + str(total_learner_step / 1e6) + "M_" +
                    str(ave_test_reward) + "_weights.pickle",
                    "wb") as pickle_out:
                pickle.dump(weights_all, pickle_out)
                print("****** Weights saved by time! ******")
            save_times = total_learner_step // opt.save_interval

        # save everything every checkpoint_freq s
        if total_time // opt.checkpoint_freq > checkpoint_times:
            print("save everything!")
            save_start_time = time.time()

            # node_ps is not defined in this snippet; it is assumed to exist
            # at module scope in the original code.
            ps_save_op = [
                node_ps[i].save_weights.remote() for i in range(opt.num_nodes)
            ]
            buffer_save_op = [
                node_buffer[node_index][i].save.remote()
                for i in range(opt.num_buffers)
                for node_index in range(opt.num_nodes)
            ]
            ray.wait(buffer_save_op + ps_save_op,
                     num_returns=opt.num_nodes * opt.num_buffers + 1)

            print("total time for saving :", time.time() - save_start_time)
            checkpoint_times = total_time // opt.checkpoint_freq
Example #10
def worker_test(ps, replay_buffer, opt):
    agent = Actor(opt, job="main")
    test_env = TradingEnv()
    agent.test(ps, replay_buffer, opt, test_env)
Example #11
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()
    rand_buff1 = np.random.choice(opt.num_buffers, 1)[0]

    random_steps = 0

    while True:
        # ------ env set up ------

        env = TradingEnv()
        # env = Wrapper(env, opt.action_repeat, opt.reward_scale)
        # ------ env set up end ------

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        ep_score, ep_target_bias = 0, 0

        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))

        t_queue = 1

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())

        while True:

            # No need to sample a random action if weights are loaded from a local file.
            if random_steps > opt.start_steps or opt.weights_file or opt.recover:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                random_steps += 1
            # Step the env
            o2, r, d, info = env.step(a)

            ep_ret += r
            ep_score += info['score']
            ep_target_bias += info['target_bias']
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO  and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(o_queue, a_r_d_queue, worker_index)

            t_queue += 1

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d or ep_len > opt.max_ep_len:
                sample_times, steps, _ = ray.get(replay_buffer[0].get_counts.remote())

                # print('rollout ep_len:', ep_len * opt.action_repeat, 'ep_score:', ep_score,
                #       'ep_target_bias:', ep_target_bias)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))

                # for a_l_ratio control
                learner_steps, actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())

                while (actor_steps - last_actor_steps) / (
                        learner_steps - last_learner_steps + 1) > opt.a_l_ratio and last_learner_steps > 0:
                    time.sleep(1)
                    learner_steps, actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())
Example #12
def worker_test(ps, replay_buffer, opt):
    agent = Actor(opt, job="main")
    test_env = gym.make(opt.env_name)
    agent.test(ps, replay_buffer, opt, test_env)