Example #1
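    # Single-process DDPG on Pendulum-v0: act / send_experience / learn loop
    # for 200 episodes; returns the mean episode reward of the last 25 episodes.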
    def doTestDDPG(self):
        np.random.seed(0)
        env = gym.make("Pendulum-v0")
        env.seed(0)
        ddpg_g = tf.Graph()
        with ddpg_g.as_default():
            tf.set_random_seed(123)
            agent = agents[DDPG_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                DDPG_AGENT_CONFIG,
                DDPG_MODEL_CONFIG,
                distributed_spec={})
        reward_window = WindowStat("reward", 25)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        act_count = 0

        for i in range(200):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act(
                    [ob], False, use_perturbed_action=False)
                act_count += 1
                next_ob, reward, done, info = env.step(action[0])
                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                next_obs.append(next_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        next_obs=next_obs)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                    if DDPG_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(
                            indexes=batch_data["indexes"],
                            td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        return reward_window.stats()["reward_mean"]
Example #2
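    # Single-process PPO on CartPole-v0 for 300 episodes; logits and value
    # predictions are stored with each transition for the PPO update.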
    def doTestPPO(self):
        env = gym.make("CartPole-v0")
        env.seed(0)
        ppo_g = tf.Graph()
        with ppo_g.as_default():
            tf.set_random_seed(123)
            agent = agents[PPO_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                PPO_AGENT_CONFIG,
                PPO_MODEL_CONFIG,
                distributed_spec={})

        reward_window = WindowStat("reward", 25)
        obs, actions, rewards, next_obs, dones, value_preds, logits = (
            [], [], [], [], [], [], [])
        act_count = 0

        for i in range(300):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob], False)
                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                next_obs.append(next_ob)
                dones.append(done)

                logits.append(results["logits"][0])
                value_preds.append(results["value_preds"][0])
                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        next_obs=next_obs,
                        value_preds=value_preds,
                        logits=logits)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            reward_window.push(episode_reward)

        return reward_window.stats()["reward_mean"]
Example #3
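# Single-process training on CartPole-v0 for the agent defined by AGENT_CONFIG
# (value predictions are stored with each transition); checkpoints go to
# "ckpt_dir" and a SavedModel is exported to "dump_dir" after 600 episodes.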
def main():
    env = gym.make("CartPole-v0")

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        AGENT_CONFIG,
                        MODEL_CONFIG,
                        distributed_spec={},
                        export_dir="dump_dir",
                        checkpoint_dir="ckpt_dir")

    reward_window = WindowStat("reward", 50)
    length_window = WindowStat("length", 50)
    loss_window = WindowStat("loss", 50)
    obs, actions, rewards, next_obs, dones, value_preds = (
        [], [], [], [], [], [])
    act_count = 0

    for i in range(600):
        ob = env.reset()
        done = False
        episode_reward = .0
        episode_len = 0

        while not done:
            action, results = agent.act([ob], False)
            next_ob, reward, done, info = env.step(action[0])
            act_count += 1

            obs.append(ob)
            actions.append(action[0])
            rewards.append(0.1 * reward)
            next_obs.append(next_ob)
            dones.append(done)
            value_preds.append(results["value_preds"][0])
            if agent.ready_to_send:
                agent.send_experience(obs=obs,
                                      actions=actions,
                                      rewards=rewards,
                                      next_obs=next_obs,
                                      dones=dones,
                                      value_preds=value_preds)
            if agent.ready_to_receive:
                batch_data = agent.receive_experience()
                res = agent.learn(batch_data)
                loss_window.push(res['loss'])

            ob = next_ob
            episode_reward += reward
            episode_len += 1

            if act_count % 1000 == 0:
                print("timestep:", act_count, reward_window, length_window)

        reward_window.push(episode_reward)
        length_window.push(episode_len)

    agent.export_saved_model()
    print("Done.")
Example #4
def main():
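    # Distributed actor-learner training on PongNoFrameskip-v4: this process
    # takes the ps / memory / actor / learner role given by FLAGS.job_name;
    # actors roll out a vectorized environment and stream experience through
    # memory to the learner.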
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
        print(config)

    job_name = FLAGS.job_name

    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(
        env=env,
        frame_stack=True,
        clip_rewards=False,
        episode_life=True,
        wrap_frame=True,
        frame_resize=42)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        config["agent"],
        config["model"],
        distributed_spec={
            "ps_hosts": FLAGS.ps_hosts,
            "memory_hosts": FLAGS.memory_hosts,
            "actor_hosts": FLAGS.actor_hosts,
            "learner_hosts": FLAGS.learner_hosts,
            "job_name": FLAGS.job_name,
            "task_index": FLAGS.task_index
        },
        custom_model=MyVTmodel,
        checkpoint_dir=None)
    all_cost = time.time()
    if job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif job_name == "memory":
        start_tt = time.time()
        log_count = 0
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            if time.time() - start_tt > log_count:
                log_count += 1
                print(agent._receive_count, "actor2mem_q:",
                      agent._actor2mem_q.qsize(), "mem2learner_q:",
                      agent._mem2learner_q.qsize())
                sys.stdout.flush()
    elif job_name == "actor":
        print("actor starts===>")
        start_tt = time.time()
        log_count = 0
        act_log_count = 0

        # create vectorized env
        def make_env(rank):
            def make_atari_env():
                env = make_atari("PongNoFrameskip-v4")
                env = wrap_deepmind(
                    env=env,
                    frame_stack=True,
                    clip_rewards=False,
                    episode_life=True,
                    wrap_frame=True,
                    frame_resize=42)
                env.seed(rank)
                return env

            return make_atari_env

        num_env = config["agent"].get("num_env", 1)
        vec_env = VectorizedEnvironment(
            make_env=make_env, num_env=num_env, seed=100 * FLAGS.task_index)

        act_count = 0
        reward_window = WindowStat("reward", 10)
        length_window = WindowStat("length", 10)
        obs, actions, rewards, dones, logits = [], [], [], [], []
        agent.sync_vars()

        while not agent.should_stop():
            ob = vec_env.reset()
            episode_reward = np.zeros(num_env, )
            episode_len = np.zeros(num_env, )

            while not agent.should_stop():
                action, results = agent.act(ob, False)
                act_count += 1

                new_ob, reward, done, info = vec_env.step(action)

                obs.append(ob)
                actions.append(action)

                rewards.append(reward)
                dones.append(done)
                logits.append(results["logits"])

                if agent.ready_to_send:

                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        logits=logits,
                        vec_env=True,
                        num_env=num_env)
                    agent.sync_vars()

                ob = new_ob

                episode_reward += np.asarray(reward)
                episode_len += 1
                for i in range(num_env):
                    if done[i]:
                        reward_window.push(episode_reward[i])
                        length_window.push(episode_len[i])
                        episode_reward[i] = .0
                        episode_len[i] = 0
                total_cost = time.time() - start_tt
                if int(total_cost / 5) > log_count:
                    log_count += 1
                    print("act_count:", act_count, "actor2mem_q:",
                          agent._actor2mem_q.qsize(), "total:", total_cost)
                    print('total_cost:', total_cost, reward_window)
                    print(length_window)
                    sys.stdout.flush()
                if int((act_count * num_env) / 10000) > act_log_count:
                    act_log_count += 1
                    print('timestep:', act_log_count * 10000, reward_window)

    elif job_name == "learner":
        print("learner starts===>")
        start_tt = time.time()
        train_count = 0
        try:
            while not agent.should_stop():
                batch_data = agent.receive_experience()
                if batch_data:

                    extra_data = agent.learn(batch_data)
                    train_count += 1
                    print("learning {}".format(extra_data), "receive_q:",
                          agent._receive_q.qsize())
                    print("train_count:", train_count, "total:",
                          time.time() - start_tt)
                    sys.stdout.flush()

        except tf.errors.OutOfRangeError as e:
            print("memory has stopped.")
    else:
        raise ValueError("Invalid job_name.")
    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
Example #5
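# Single-process evolution-strategies-style training on CartPole-v0 with the
# MyESmodel custom model: parameters are perturbed with per-trial seeds and
# scales, paired positive/negative returns are passed to agent.learn(), and
# the unperturbed policy is evaluated after each update.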
def main():
    env = gym.make("CartPole-v0")
    env.seed(0)

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        AGENT_CONFIG,
                        MODEL_CONFIG,
                        distributed_spec={},
                        checkpoint_dir="ckpt_dir",
                        custom_model=MyESmodel)

    reward_window = WindowStat("reward", 50)
    length_window = WindowStat("length", 50)

    init_perturbation_scale = 0.1

    seeds, rewards, perturbation_scales = list(), list(), list()
    is_positive_direction = list()

    # how many episodes needed for one trial
    episode_per_perturbation = 1

    returns = list()

    for i in range(4000):
        ob = env.reset()
        done = False
        episode_reward = .0
        episode_len = 0

        if i % episode_per_perturbation == 0:
            # perturb parameters every `episode_per_perturbation` episodes
            is_positive = (len(is_positive_direction) == 0
                           or not is_positive_direction[-1])

            # each seed is used twice, once per direction
            seed = np.random.randint(1000000) if is_positive else seeds[-1]
            perturbation_scale = max(
                init_perturbation_scale * (1 - i / 2000.0), 0.02)

            feed = agent.model.perturbation_feed
            fetch = [agent.model.reset_perturbation_op]

            agent.executor.run(fetches=fetch,
                               feed_dict={
                                   feed['perturbation_seeds']: [seed],
                                   feed['perturbation_scales']:
                                   [perturbation_scale],
                                   feed['positive_perturbation']:
                                   is_positive
                               })

            if is_positive:
                seeds.append(seed)
                perturbation_scales.append(perturbation_scale)
            is_positive_direction.append(is_positive)

        while not done:
            action, result = agent.act([ob], True, use_perturbed_action=True)

            next_ob, reward, done, info = env.step(action[0])

            ob = next_ob
            episode_reward += reward
            episode_len += 1

        rewards.append(episode_reward)
        reward_window.push(episode_reward)
        length_window.push(episode_len)
        if len(rewards) == episode_per_perturbation:
            returns.append(np.mean(rewards))
            rewards = []
            if len(returns) == 2 * agent.config.get('sample_batch_size', 100):
                print(reward_window)
                assert len(seeds) == (len(returns) / 2)
                assert len(perturbation_scales) == (len(returns) / 2)
                agent.learn(
                    batch_data=dict(perturbation_seeds=seeds,
                                    perturbation_scales=perturbation_scales,
                                    returns=np.reshape(returns, [-1, 2])))
                seeds = []
                perturbation_scales = []
                returns = []
                is_positive_direction = []

                # evaluate for 20 episodes
                test_rewards = list()
                for j in range(20):
                    done = False
                    ob = env.reset()
                    episode_reward = 0
                    episode_len = 0
                    while not done:
                        action, result = agent.act([ob],
                                                   True,
                                                   use_perturbed_action=False)

                        next_ob, reward, done, info = env.step(action[0])

                        ob = next_ob
                        episode_reward += reward
                        episode_len += 1
                    test_rewards.append(episode_reward)
                print("[evaluation] average reward of 20 episodes:",
                      np.mean(test_rewards))

                print('train at ', i)

    agent.export_saved_model(export_dir="dump_dir")
    print("Done.")
Example #6
def main(_):
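    # Distributed training on CartPole-v0 with the MyDQN custom model: the
    # process acts as ps, memory, actor or learner depending on FLAGS.job_name.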
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
        print(config)

    env = gym.make("CartPole-v0")
    env.seed(0)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        config["agent"],
                        config["model"],
                        distributed_spec={
                            "ps_hosts": FLAGS.ps_hosts,
                            "memory_hosts": FLAGS.memory_hosts,
                            "actor_hosts": FLAGS.actor_hosts,
                            "learner_hosts": FLAGS.learner_hosts,
                            "job_name": FLAGS.job_name,
                            "task_index": FLAGS.task_index
                        },
                        custom_model=MyDQN)

    if FLAGS.job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif FLAGS.job_name == "memory":
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            sys.stdout.flush()
    elif FLAGS.job_name == "actor":
        print("actor starts===>")
        reward_window = WindowStat("reward", 50)
        length_window = WindowStat("length", 50)
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        agent.sync_vars()

        while not agent.should_stop():
            ob = env.reset()
            done = False
            episode_reward = .0
            episode_len = 0

            while not done and not agent.should_stop():
                action, results = agent.act([ob], False)

                new_ob, reward, done, info = env.step(action[0])

                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                new_obs.append(new_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          next_obs=new_obs,
                                          dones=dones)
                    agent.sync_vars()
                ob = new_ob
                episode_reward += reward
                episode_len += 1

            reward_window.push(episode_reward)
            length_window.push(episode_len)
            print(reward_window)
            print(length_window)
            sys.stdout.flush()
    elif FLAGS.job_name == "learner":
        print("learner starts===>")
        while not agent.should_stop():
            batch_data = agent.receive_experience()
            if batch_data:
                extra_data = agent.learn(batch_data)
                print("learning {}".format(extra_data))
                sys.stdout.flush()
    else:
        raise ValueError("Invalid job_name.")

    print("done.")
Example #7
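# Single-process training on PongNoFrameskip-v4 with the MyDQNModel custom
# model and wrap_deepmind preprocessing; priorities are updated when
# prioritized replay is enabled.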
def main():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env=env,
                        frame_stack=True,
                        clip_rewards=False,
                        episode_life=True,
                        wrap_frame=True,
                        frame_resize=42)

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        AGENT_CONFIG,
                        MODEL_CONFIG,
                        distributed_spec={},
                        custom_model=MyDQNModel)

    reward_window = WindowStat("reward", 10)
    length_window = WindowStat("length", 10)
    loss_window = WindowStat("loss", 10)
    obs, actions, rewards, next_obs, dones = [], [], [], [], []

    for i in range(2000):
        ob = env.reset()
        ob = np.asarray(ob)
        done = False
        episode_reward = .0
        episode_len = 0

        while not done:
            action, results = agent.act([ob],
                                        deterministic=False,
                                        use_perturbed_action=False)

            next_ob, reward, done, info = env.step(action[0])
            next_ob = np.asarray(next_ob)

            obs.append(ob)
            actions.append(action[0])
            rewards.append(reward)
            next_obs.append(next_ob)
            dones.append(done)
            if agent.ready_to_send:
                agent.send_experience(obs=obs,
                                      actions=actions,
                                      rewards=rewards,
                                      next_obs=next_obs,
                                      dones=dones)
            if agent.ready_to_receive:
                batch_data = agent.receive_experience()
                res = agent.learn(batch_data)
                loss_window.push(res['loss'])

                if AGENT_CONFIG.get("prioritized_replay", False):
                    agent.update_priorities(indexes=batch_data["indexes"],
                                            td_error=res["td_error"])

            ob = next_ob
            episode_reward += reward
            episode_len += 1
        agent.add_episode(1)
        reward_window.push(episode_reward)
        length_window.push(episode_len)
        if i % 10 == 0:
            print('episode at', i)
            print(reward_window)
            print(length_window)
            print(loss_window)

    print("Done.")
Example #8
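    # Checkpoint round-trip test: train on CartPole-v0 while checkpointing,
    # rebuild the agent from the checkpoint in a fresh graph, and compare the
    # restored policy's mean reward with the pre-save one.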
    def doTestCkpt(self):
        trial_timestamp = time.strftime("%Y%m%d-%H%M%S")
        np.random.seed(0)
        env = gym.make("CartPole-v0")
        env.seed(0)
        dqn_g = tf.Graph()
        with dqn_g.as_default():
            tf.set_random_seed(123)
            agent = agents[DQN_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                DQN_AGENT_CONFIG,
                DQN_MODEL_CONFIG,
                checkpoint_dir="ckpt_dir_{}".format(trial_timestamp),
                distributed_spec={})
        reward_window = WindowStat("reward", 50)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        act_count = 0

        for i in range(500):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob],
                                            deterministic=False,
                                            use_perturbed_action=False)

                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                next_obs.append(next_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          next_obs=next_obs,
                                          dones=dones)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                    if DQN_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(indexes=batch_data["indexes"],
                                                td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        prev_perf = reward_window.stats()["reward_mean"]
        print("Performance before saving is {}".format(prev_perf))

        new_dqn_g = tf.Graph()
        with new_dqn_g.as_default():
            agent = agents[DQN_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                DQN_AGENT_CONFIG,
                DQN_MODEL_CONFIG,
                checkpoint_dir="ckpt_dir_{}".format(trial_timestamp),
                distributed_spec={})
        reward_window = WindowStat("reward", 10)
        ob = env.reset()
        for i in range(10):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob],
                                            deterministic=True,
                                            use_perturbed_action=False)

                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                ob = next_ob
                episode_reward += reward

            agent.add_episode(1)
            reward_window.push(episode_reward)

        cur_perf = reward_window.stats()["reward_mean"]
        print("Performance after restore is {}".format(cur_perf))
        return prev_perf - cur_perf
Example #9
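# Offline (batch) RL with the MyBCQModel custom model: a generative model is
# first trained by behavior cloning on logged data, then the agent is trained
# from the replay buffer with IPS-based off-policy evaluation, periodic
# SavedModel exports, and occasional online sanity checks on CartPole-v0.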
def main():
    # create offline_env
    env = offline_env(FLAGS.train_data_file, batch_size=128, n_step=MODEL_CONFIG.get("n_step", 1))
    eval_env = offline_env(FLAGS.eval_data_file, batch_size=128, n_step=MODEL_CONFIG.get("n_step", 1))

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        AGENT_CONFIG,
        MODEL_CONFIG,
        distributed_spec={},
        export_dir="bcq_tmp",
        checkpoint_dir="bcq_tmp",
        custom_model=MyBCQModel)

    clone_loss_window = WindowStat("clone_loss", 50)
    clone_reg_loss_window = WindowStat("clone_reg_loss", 50)
    loss_window = WindowStat("loss", 50)

    total_cost = time.time()
    clone_learn_count = 0

    # first, train a generative model by behavior cloning
    for i in range(1000):
        table_data = env.reset()
        # store raw data in replay buffer
        agent.send_experience(
            obs=table_data["obs"],
            actions=table_data["actions"],
            rewards=table_data["rewards"],
            dones=table_data["dones"],
            next_obs=table_data["next_obs"])

        # sample from replay buffer
        # the size of sampled data is equal to `AGENT_CONFIG["batch_size"]`
        batch_data = agent.receive_experience()
        clone_loss, clone_reg_loss = agent.behavior_learn(batch_data=batch_data)
        clone_learn_count += 1
        clone_loss_window.push(clone_loss)
        clone_reg_loss_window.push(clone_reg_loss)
        if i % 50 == 0:
            print(clone_loss_window)
            print(clone_reg_loss_window)

    # evaluate the generative model
    all_clone_act, gd_act = [], []
    for i in range(100):
        table_data = env.reset()
        clone_act = agent.behavior_act(table_data["obs"])
        all_clone_act.extend(np.argsort(-1.0 * clone_act, axis=1).tolist())
        gd_act.extend(table_data["actions"])
    acc1 = np.sum(
        np.array(all_clone_act)[:, 0] == np.array(gd_act)) * 1.0 / len(gd_act)
    print("acc @top1:", acc1)

    # second, train BCQ
    agent.reset_global_step()

    epochs_to_end = 10
    max_global_steps_to_end = 10000
    learn_count = 0
    env.num_epoch = 0
    while env.num_epoch < epochs_to_end and learn_count < max_global_steps_to_end:
        table_data = env.reset()

        # store raw data in replay buffer
        agent.send_experience(
            obs=table_data["obs"],
            actions=table_data["actions"],
            rewards=table_data["rewards"],
            dones=table_data["dones"],
            next_obs=table_data["next_obs"])

        # sample from replay buffer
        # the size of sampled data is equal to `AGENT_CONFIG["batch_size"]`
        batch_data = agent.receive_experience()
        # update the model
        res = agent.learn(batch_data)
        # record the loss
        loss_window.push(res["loss"])
        learn_count += 1

        if AGENT_CONFIG.get("prioritized_replay", False):
            # update priorities
            agent.update_priorities(
                indexes=batch_data["indexes"],
                td_error=res["td_error"])

        if learn_count % 50 == 0:
            print("learn_count:", learn_count)
            print(loss_window)

            # offline evaluation
            batch_weights = []
            batch_rewards = []
            eval_num = 50
            for _ in range(eval_num):
                batch_data = eval_env.reset()

                importance_ratio = agent.importance_ratio(batch_data)
                batch_weights.append(importance_ratio)
                batch_rewards.append(batch_data["rewards"])
            ips, ips_sw, wips, wips_sw, wips_sw_mean = ips_eval(
                batch_weights=batch_weights, batch_rewards=batch_rewards, gamma=MODEL_CONFIG.get("gamma", 0.95))

            agent.add_extra_summary({
                agent.model.ips_score_op: ips,
                agent.model.ips_score_stepwise_op: ips_sw,
                agent.model.wnorm_ips_score_op: wips,
                agent.model.wnorm_ips_score_stepwise_op: wips_sw,
                agent.model.wnorm_ips_score_stepwise_mean_op: wips_sw_mean
            })
            print("[IPS Policy Evaluation @learn_count={}] ips={}, ips_stepwise={}, wnorm_ips={}, wnorm_ips_stepwise={}, wnorm_ips_stepwise_mean={}".format(
                learn_count, ips, ips_sw, wips, wips_sw, wips_sw_mean))

        if learn_count % 2000 == 0:
            # export saved model at any time
            # an AssertionError will be raised if the export_dir already exists.
            agent.export_saved_model("bcq_export_dir{}".format(learn_count))

        if learn_count % 200 == 0:
            # test with simulator
            gym_env = gym.make("CartPole-v0")
            for ix in range(10):
                ob = gym_env.reset()
                done = False
                episode_reward = .0

                while not done:
                    action, results = agent.act(
                        [ob], deterministic=False, use_perturbed_action=False)

                    next_ob, reward, done, info = gym_env.step(action[0])
                    episode_reward += reward
                    ob = next_ob
                print("train@", learn_count, "test@", ix, "reward:", episode_reward)

    print("Done.", "num_epoch:", env.num_epoch, "learn_count:", learn_count,
          "total_cost:", time.time() - total_cost)
Example #10
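# Proportional prioritized replay buffer backed by sum/min segment trees; a
# usage sketch follows the class definition.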
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
          Max number of transitions to store in the buffer. When the buffer
          overflows the old memories are dropped.
        alpha: float
          how much prioritization is used
          (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._prio_change_stats = WindowStat("reprio", 1000)

        self._debug_cost = 0

    def add(self, obs, actions, rewards, dones, next_obs, weights, **kwargs):
        """See ReplayBuffer.store_effect"""

        super(PrioritizedReplayBuffer, self).add(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs)

        if weights is None:
            weights = self._max_priority
            constant_weight = weights**self._alpha
            for idx in self._cover_indices:
                self._it_sum[idx] = constant_weight
                self._it_min[idx] = constant_weight
        else:
            weights = np.power(weights, self._alpha)
            for n, idx in enumerate(self._cover_indices):
                self._it_sum[idx] = weights[n]
                self._it_min[idx] = weights[n]

    def _sample_proportional(self, batch_size):
        res = []
        sum_value = self._it_sum.sum(0, len(self))
        mass = np.random.random(size=batch_size) * sum_value
        for i in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            idx = self._it_sum.find_prefixsum_idx(mass[i])
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample, it also returns the importance
        weights and indexes of the sampled experiences.


        Parameters
        ----------
        batch_size: int
          How many transitions to sample.
        beta: float
          To what degree to use importance weights
          (0 - no corrections, 1 - full correction)

        Returns
        -------
        batch_data: dict
          A dict with the same keys as `ReplayBuffer._encode_sample`
          (`obs`, `actions`, `rewards`, `next_obs`, `dones`), plus:

          weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
          indexes: [int]
            indexes in buffer of sampled experiences
        """
        assert beta > 0
        self._num_sampled += batch_size

        start = time.time()
        idxes = self._sample_proportional(batch_size)
        self._debug_cost += time.time() - start

        sum_value = self._it_sum.sum()

        weights = []
        p_min = self._it_min.min() / sum_value
        max_weight = (p_min * len(self))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / sum_value
            weight = (p_sample * len(self))**(-beta)
            weights.append(weight / max_weight)
        weights = np.asarray(weights)
        encoded_sample = self._encode_sample(idxes)
        encoded_sample["weights"] = weights
        encoded_sample["indexes"] = idxes
        return encoded_sample

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index indexes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        indexes: [int]
          List of indexes of sampled transitions
        priorities: [float]
          List of updated priorities corresponding to
          transitions at the sampled indexes denoted by
          variable `indexes`.
        """
        assert len(indexes) == len(priorities)
        pvs = np.power(priorities, self._alpha).astype(np.float64)
        for idx, priority, pv in zip(indexes, priorities, pvs):
            assert priority > 0
            assert 0 <= idx < len(self)
            delta = pv - self._it_sum[idx]
            self._prio_change_stats.push(delta)
            self._it_sum[idx] = pv
            self._it_min[idx] = pv

        self._max_priority = max(self._max_priority, np.max(priorities))

    def stats(self, debug=False):
        parent = ReplayBuffer.stats(self, debug)
        if debug:
            parent.update(self._prio_change_stats.stats())
        return parent
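A minimal usage sketch for the class above (not part of the original source): it assumes the `ReplayBuffer` base class from Example #11 and the `SumSegmentTree`/`MinSegmentTree` helpers are importable, and the shapes and hyperparameters are purely illustrative.

import numpy as np

# capacity must be larger than the biggest batch added at once; alpha controls
# how strongly priorities skew the sampling distribution
buf = PrioritizedReplayBuffer(size=1024, alpha=0.6)

# add a batch of 8 transitions; weights=None assigns the current max priority
buf.add(obs=np.random.rand(8, 4).astype(np.float32),
        actions=np.random.randint(0, 2, size=(8, )),
        rewards=np.random.rand(8).astype(np.float32),
        dones=np.zeros(8, dtype=np.float32),
        next_obs=np.random.rand(8, 4).astype(np.float32),
        weights=None)

# proportional sampling with importance-weight correction; the returned dict
# also carries "weights" and "indexes" for the learner
batch = buf.sample(batch_size=4, beta=0.4)

# after computing TD errors, write the new (strictly positive) priorities back
buf.update_priorities(indexes=batch["indexes"],
                      priorities=np.abs(np.random.randn(4)) + 1e-6)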
Example #11
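# Basic fixed-capacity replay buffer that stores transitions in pre-allocated
# numpy arrays and overwrites the oldest entries once full; a usage sketch
# follows the class definition.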
class ReplayBuffer(object):
    """Basic replay buffer.

    Supports O(1) `add` and O(1) `sample` operations (w.r.t. each transition).
    The buffer is implemented with fixed-length storage in which the insertion
    index wraps back to zero once the capacity is reached.
    """

    def __init__(self, size):
        """Create the replay buffer.

        Parameters
        ----------
        size: int
          Max number of transitions to store in the buffer. When the buffer
          overflows the old memories are dropped.
        """
        self._maxsize = size
        self._next_idx = 0
        self._hit_count = np.zeros(size)
        self._eviction_started = False
        self._num_added = 0
        self._num_sampled = 0
        self._evicted_hit_stats = WindowStat("evicted_hit", 1000)
        self._est_size_bytes = 0

        self._extra_fields = None

        self._first_add = True

    def __len__(self):
        return min(self._num_added, self._maxsize)

    def add(self,
            obs,
            actions,
            rewards,
            dones,
            next_obs=None,
            weights=None,
            **kwargs):

        batch_size = np.shape(rewards)[0]
        assert batch_size < self._maxsize, \
            "the batch added at once must be smaller than the buffer size"

        truncated_size = min(batch_size, self._maxsize - self._next_idx)
        extra_size = max(0, batch_size - (self._maxsize - self._next_idx))

        if self._extra_fields is None:
            self._extra_fields = list(kwargs.keys())

        if self._first_add:
            self._obs = np.zeros(
                shape=((self._maxsize, ) + np.shape(obs)[1:]), dtype=obs.dtype)
            self._actions = np.zeros(
                shape=((self._maxsize, ) + np.shape(actions)[1:]),
                dtype=actions.dtype)
            self._rewards = np.zeros(shape=(self._maxsize, ), dtype=np.float32)

            if next_obs is not None:
                self._next_obs = np.zeros(
                    shape=((self._maxsize, ) + np.shape(next_obs)[1:]),
                    dtype=next_obs.dtype)

            if weights is not None:
                self._weights = np.zeros(
                    shape=((self._maxsize, )), dtype=np.float32)

            self._dones = np.zeros(shape=(self._maxsize, ), dtype=np.float32)

            self._extras = {
                name: np.zeros(
                    shape=((self._maxsize, ) + np.shape(kwargs[name])[1:]),
                    dtype=kwargs[name].dtype)
                for name in self._extra_fields
            }

            self._first_add = False

        self._num_added += batch_size

        #if self._num_added <= self._maxsize:
        #self._est_size_bytes += sum(sys.getsizeof(d) for d in data)

        self._obs[self._next_idx:self._next_idx +
                  truncated_size] = obs[:truncated_size]
        self._actions[self._next_idx:self._next_idx +
                      truncated_size] = actions[:truncated_size]
        self._rewards[self._next_idx:self._next_idx +
                      truncated_size] = rewards[:truncated_size]
        self._dones[self._next_idx:self._next_idx +
                    truncated_size] = dones[:truncated_size]

        if next_obs is not None:
            self._next_obs[self._next_idx:self._next_idx +
                           truncated_size] = next_obs[:truncated_size]
        if weights is not None:
            self._weights[self._next_idx:self._next_idx +
                          truncated_size] = weights[:truncated_size]

        for name in self._extras.keys():
            self._extras[name][self._next_idx:self._next_idx +
                               truncated_size] = kwargs[name][:truncated_size]

        if extra_size > 0:
            self._obs[:extra_size] = obs[truncated_size:]
            self._actions[:extra_size] = actions[truncated_size:]
            self._rewards[:extra_size] = rewards[truncated_size:]
            self._dones[:extra_size] = dones[truncated_size:]
            if next_obs is not None:
                self._next_obs[:extra_size] = next_obs[truncated_size:]
            if weights is not None:
                self._weights[:extra_size] = weights[truncated_size:]

            for name in self._extras.keys():
                self._extras[name][:extra_size] = kwargs[name][truncated_size:]

        if self._next_idx + batch_size >= self._maxsize:
            self._eviction_started = True
        self._cover_indices = [
            self._next_idx + i for i in range(truncated_size)
        ]
        if extra_size > 0:
            self._cover_indices += [i for i in range(extra_size)]
        self._next_idx = (self._next_idx + batch_size) % self._maxsize
        if self._eviction_started:
            for i in self._cover_indices:
                self._evicted_hit_stats.push(self._hit_count[i])
                self._hit_count[i] = 0

    def _encode_sample(self, idxes):
        idxes = np.asarray(idxes)

        obs = np.take(self._obs, indices=idxes, axis=0)
        actions = np.take(self._actions, indices=idxes, axis=0)
        rewards = np.take(self._rewards, indices=idxes, axis=0)
        next_obs = np.take(self._next_obs, indices=idxes, axis=0)
        dones = np.take(self._dones, indices=idxes, axis=0)

        batch_data = dict(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs)

        return batch_data

    def sample(self, batch_size):
        """Sample a batch of experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        batch_data: dict
          A dict with keys `obs`, `actions`, `rewards`, `next_obs` and
          `dones`, where `dones[i] = 1` if executing `actions[i]` resulted
          in the end of an episode and 0 otherwise.
        """
        idxes = np.random.randint(
            0, min(self._num_added, self._maxsize) - 1, size=(batch_size, ))
        self._num_sampled += batch_size
        return self._encode_sample(idxes)

    def stats(self, debug=False):
        data = {
            "added_count": self._num_added,
            "sampled_count": self._num_sampled,
            "est_size_bytes": self._est_size_bytes,
            "num_entries": len(self),
        }
        if debug:
            data.update(self._evicted_hit_stats.stats())
        return data
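A small sketch (not part of the original source) of the wrap-around behavior described in the docstring; the array shapes are illustrative.

import numpy as np

buf = ReplayBuffer(size=5)
for step in range(7):  # add 7 one-transition batches into a buffer of capacity 5
    buf.add(obs=np.array([[float(step)]], dtype=np.float32),
            actions=np.array([0]),
            rewards=np.array([1.0], dtype=np.float32),
            dones=np.array([0.0], dtype=np.float32),
            next_obs=np.array([[float(step) + 1]], dtype=np.float32))

print(len(buf))     # 5: capped at `size`, the two oldest transitions were overwritten
print(buf.stats())  # added_count=7, num_entries=5, ...
print(buf.sample(batch_size=2)["obs"])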
Example #12
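# Loads an exported SavedModel with tf.saved_model.loader, resolves the input
# placeholders and output tensors from the "predict_results" signature, and
# runs rendered evaluation rollouts in a gym environment.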
def main():
    gym_env = gym.make("CartPole-v0")

    atari_env = make_atari("PongNoFrameskip-v4")
    atari_env = wrap_deepmind(env=atari_env,
                              frame_stack=True,
                              clip_rewards=False,
                              episode_life=True,
                              wrap_frame=True,
                              frame_resize=42)

    # choose the env below to match the one used when exporting the saved_model
    # env = atari_env
    env = gym_env

    with tf.Session() as sess:
        path = 'dump_dir'
        MetaGraphDef = tf.saved_model.loader.load(
            sess, tags=[sm.tag_constants.SERVING], export_dir=path)

        # get SignatureDef protobuf
        SignatureDef_d = MetaGraphDef.signature_def
        SignatureDef = SignatureDef_d["predict_results"]

        # get inputs/outputs TensorInfo protobuf
        ph_inputs = {}
        for name, ts_info in SignatureDef.inputs.items():
            ph_inputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)

        outputs = {}
        for name, ts_info in SignatureDef.outputs.items():
            outputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)

        for name, ph in ph_inputs.items():
            print(name, ph)

        for name, ts in outputs.items():
            print(name, ts)

        len_window = WindowStat("length", 50)
        reward_window = WindowStat("reward", 50)
        for i in range(100):
            ob = env.reset()
            env.render()
            time.sleep(0.2)
            done = False
            episode_len = 0
            episode_reward = .0

            while not done:
                action = sess.run(outputs["output_actions"],
                                  feed_dict={
                                      ph_inputs["obs_ph"]: [np.asarray(ob)],
                                      ph_inputs["deterministic_ph"]: True
                                  })
                next_ob, reward, done, info = env.step(action[0])
                env.render()
                time.sleep(0.1)
                episode_reward += reward
                episode_len += 1
                ob = next_ob

            len_window.push(episode_len)
            reward_window.push(episode_reward)
            print(reward_window)
            print(len_window)
Example #13
def main():
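    # Distributed evolution strategies on CartPole-v0: actor processes perturb
    # the behavior model with per-seed scales and send paired returns through
    # memory to the learner; the role of this process is chosen by
    # FLAGS.job_name (ps / memory / actor / learner).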
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
        print(config)

    env = gym.make("CartPole-v0")
    env.seed(0)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        config["agent"],
                        config["model"],
                        distributed_spec={
                            "ps_hosts": FLAGS.ps_hosts,
                            "memory_hosts": FLAGS.memory_hosts,
                            "actor_hosts": FLAGS.actor_hosts,
                            "learner_hosts": FLAGS.learner_hosts,
                            "job_name": FLAGS.job_name,
                            "task_index": FLAGS.task_index
                        },
                        custom_model=MyESmodel,
                        checkpoint_dir=None)
    all_cost = time.time()
    if FLAGS.job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif FLAGS.job_name == "memory":
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            print("communicating")
            time.sleep(0.1)
    elif FLAGS.job_name == "actor":
        print("actor starts===>")
        reward_window = WindowStat("reward", 50)
        length_window = WindowStat("length", 50)

        perturbation_scale = 0.1
        run_episode_per_perturbation = config["agent"].get(
            "run_episode_per_perturbation", 1)

        seeds, rewards, perturbation_scales = list(), list(), list()
        is_positive = False
        returns = list()

        agent.sync_vars()

        episode_count = 0
        try:
            while not agent.should_stop():

                # do perturbation
                is_positive = not is_positive

                # each seed will be used twice
                seed = np.random.randint(1000000) if is_positive else seeds[-1]
                perturbation_scale = max(
                    perturbation_scale * (1 - episode_count / 2000.0), 0.02)

                feed = agent.behavior_model.perturbation_feed
                fetch = [agent.behavior_model.reset_perturbation_op]

                agent.executor.run(fetches=fetch,
                                   feed_dict={
                                       feed['perturbation_seeds']: [seed],
                                       feed['perturbation_scales']:
                                       [perturbation_scale],
                                       feed['positive_perturbation']:
                                       is_positive
                                   })

                if is_positive:
                    seeds.append(seed)
                    perturbation_scales.append(perturbation_scale)

                rewards, episode_lens = rollout(
                    agent,
                    env,
                    episode_num=run_episode_per_perturbation,
                    use_perturbed_action=True)
                episode_count += run_episode_per_perturbation

                # collect the average reward for each perturbation direction;
                # each entry of `returns` holds the [positive, negative] pair
                # for one seed
                if len(returns) == 0:
                    returns.append([np.mean(rewards)])
                elif len(returns[-1]) < 2:
                    returns[-1].append(np.mean(rewards))
                else:
                    returns.append([np.mean(rewards)])

                if len(returns) == agent.config.get(
                        'sample_batch_size', 100) and len(returns[-1]) == 2:
                    # send out the results for the latest `sample_batch_size` * 2 trials
                    print(reward_window)
                    assert len(seeds) == len(returns)
                    assert len(perturbation_scales) == len(returns)

                    agent.send_experience(
                        **dict(perturbation_seeds=seeds,
                               perturbation_scales=perturbation_scales,
                               returns=returns))

                    # reset the direction
                    is_positive = False

                    # synchronize the weights from parameter server to local behavior_model
                    agent.sync_vars()

                    # do evaluation for 20 episodes
                    evaluation_num = 20
                    evl_returns, _ = rollout(agent,
                                             env,
                                             episode_num=evaluation_num,
                                             use_perturbed_action=False)
                    print(
                        "evaluation at episode:", episode_count,
                        ",avg episode reward of {} evaluation:".format(
                            evaluation_num), np.mean(evl_returns))

                reward_window.push(rewards)
                length_window.push(episode_lens)
                if episode_count % 50 == 0:
                    print(reward_window)
                    print(length_window)
                    sys.stdout.flush()
        except tf.errors.OutOfRangeError as e:
            print("memory has stopped.")

    elif FLAGS.job_name == "learner":
        print("learner starts===>")
        train_count = 0
        try:
            while not agent.should_stop():
                batch_data = agent.receive_experience()
                if batch_data:
                    extra_data = agent.learn(batch_data)
                    train_count += 1
                    print("learning {}".format(extra_data))
                    sys.stdout.flush()
        except tf.errors.OutOfRangeError as e:
            print("memory has stopped.")
    else:
        raise ValueError("Invalid job_name.")
    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
Example #14
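# Same act / send_experience / learn loop as the gym examples, run on the
# custom MaxComponentEnv (num_arms=6); a SavedModel is exported to
# "hook_dump_dir" after 100 episodes.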
def main():
    env = MaxComponentEnv(num_arms=6)

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        AGENT_CONFIG,
                        MODEL_CONFIG,
                        distributed_spec={},
                        export_dir="hook_dump_dir")

    reward_window = WindowStat("reward", 50)
    length_window = WindowStat("length", 50)
    loss_window = WindowStat("loss", 50)
    obs, actions, rewards, next_obs, dones = [], [], [], [], []
    act_count = 0

    for i in range(100):
        ob = env.reset()
        done = False
        episode_reward = .0
        episode_len = 0

        while not done:
            action, results = agent.act([ob],
                                        deterministic=False,
                                        use_perturbed_action=False)

            next_ob, reward, done, info = env.step(action[0])
            act_count += 1

            obs.append(ob)
            actions.append(action[0])
            rewards.append(reward)
            next_obs.append(next_ob)
            dones.append(done)
            if agent.ready_to_send:
                agent.send_experience(obs=obs,
                                      actions=actions,
                                      rewards=rewards,
                                      next_obs=next_obs,
                                      dones=dones)
            if agent.ready_to_receive:
                batch_data = agent.receive_experience()
                res = agent.learn(batch_data)
                loss_window.push(res['loss'])

                if AGENT_CONFIG.get("prioritized_replay", False):
                    agent.update_priorities(indexes=batch_data["indexes"],
                                            td_error=res["td_error"])

            ob = next_ob
            episode_reward += reward
            episode_len += 1
            if act_count % 5 == 0:
                print("timestep:", act_count, reward_window, length_window)

        agent.add_episode(1)
        reward_window.push(episode_reward)
        length_window.push(episode_len)

    agent.export_saved_model()
    print("Done.")
Example #15
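    # Single-process evolution-strategies test on CartPole-v0; mirrors the ES
    # training example above and returns the mean reward of the final
    # evaluation episodes.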
    def doTestES(self):
        np.random.seed(0)
        env = gym.make("CartPole-v0")
        env.seed(0)
        es_g = tf.Graph()
        with es_g.as_default():
            tf.set_random_seed(123)
            agent = agents[ES_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                ES_AGENT_CONFIG,
                ES_MODEL_CONFIG,
                distributed_spec={},
                custom_model=MyESmodel)
        reward_window = WindowStat("reward", 25)

        perturbation_scale = 0.1

        seeds, rewards, perturbation_scales = list(), list(), list()
        is_positive_direction = list()

        episode_per_perturbation = 1

        returns = list()

        for i in range(5000):
            ob = env.reset()
            done = False
            episode_reward = .0

            if i % episode_per_perturbation == 0:
                # perturb parameters every `episode_per_perturbation` episodes
                is_positive = (len(is_positive_direction) == 0
                               or not is_positive_direction[-1])

                # each seed is used twice, once per direction
                seed = np.random.randint(1000000) if is_positive else seeds[-1]
                perturbation_scale = max(perturbation_scale * (1 - i / 2000.0),
                                         0.02)

                feed = agent.model.perturbation_feed
                fetch = [agent.model.reset_perturbation_op]

                agent.executor.run(
                    fetches=fetch,
                    feed_dict={
                        feed['perturbation_seeds']: [seed],
                        feed['perturbation_scales']: [perturbation_scale],
                        feed['positive_perturbation']: is_positive
                    })

                if is_positive:
                    seeds.append(seed)
                    perturbation_scales.append(perturbation_scale)
                is_positive_direction.append(is_positive)

            while not done:
                action, result = agent.act(
                    [ob], True, use_perturbed_action=True)

                next_ob, reward, done, info = env.step(action[0])

                ob = next_ob
                episode_reward += reward

            rewards.append(episode_reward)
            reward_window.push(episode_reward)
            if len(rewards) == episode_per_perturbation:
                returns.append(np.mean(rewards))
                rewards = []
                if len(returns) == 2 * agent.config.get(
                        'sample_batch_size', 100):
                    print(reward_window)
                    assert len(seeds) == (len(returns) / 2)
                    assert len(perturbation_scales) == (len(returns) / 2)
                    agent.learn(
                        batch_data=dict(
                            perturbation_seeds=seeds,
                            perturbation_scales=perturbation_scales,
                            returns=np.reshape(returns, [-1, 2])))
                    seeds = []
                    perturbation_scales = []
                    returns = []
                    is_positive_direction = []

                    # evaluate for 10 episodes
                    test_rewards = list()
                    for j in range(10):
                        done = False
                        ob = env.reset()
                        episode_reward = 0
                        while not done:
                            action, result = agent.act(
                                [ob], True, use_perturbed_action=False)

                            next_ob, reward, done, info = env.step(action[0])

                            ob = next_ob
                            episode_reward += reward
                        test_rewards.append(episode_reward)
                    print("[evaluation] average reward of 20 episodes:",
                          np.mean(test_rewards))
                    print('train at ', i)

        return np.mean(test_rewards)
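The `np.reshape(returns, [-1, 2])` passed to `agent.learn` relies on the alternation above: each random seed is evaluated twice in a row, first with a positive and then with a negative perturbation, so consecutive returns pair up row-wise. A minimal NumPy sketch of that layout (the normalized-difference weighting at the end is an illustrative assumption about how an ES-style update might consume it, not the framework's actual rule):

import numpy as np

# Hypothetical returns for 3 seeds, collected in the same order as the
# loop above: [r1+, r1-, r2+, r2-, r3+, r3-].
returns = [1.0, 0.4, 0.8, 0.9, 1.2, 0.3]

paired = np.reshape(returns, [-1, 2])   # one row per seed: [positive, negative]
diffs = paired[:, 0] - paired[:, 1]     # positive minus negative direction
weights = (diffs - diffs.mean()) / (diffs.std() + 1e-8)
print(paired)
print(weights)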
Example #16
def main():
    env = gym.make("Pendulum-v0")
    env.seed(0)

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        AGENT_CONFIG,
                        MODEL_CONFIG,
                        distributed_spec={},
                        checkpoint_dir="ckpt_dir",
                        export_dir="dump_dir",
                        custom_model=MyDDPG)

    reward_window = WindowStat("reward", 50)
    length_window = WindowStat("length", 50)
    loss_window = WindowStat("loss", 50)
    actor_loss = WindowStat("actor_loss", 50)
    obs, actions, rewards, next_obs, dones = [], [], [], [], []
    act_count = 0
    train_count = 0
    total_cost = time.time()
    for i in range(500):
        ob = env.reset()
        done = False
        episode_reward = .0
        episode_len = 0

        while not done:
            action, results = agent.act([ob],
                                        False,
                                        use_perturbed_action=False)
            act_count += 1
            next_ob, reward, done, info = env.step(action[0])
            obs.append(ob)
            actions.append(action[0])
            rewards.append(0.1 * reward)
            next_obs.append(next_ob)
            dones.append(done)
            if agent.ready_to_send:
                agent.send_experience(obs=obs,
                                      actions=actions,
                                      rewards=rewards,
                                      dones=dones,
                                      next_obs=next_obs)
            if agent.ready_to_receive:
                batch_data = agent.receive_experience()
                res = agent.learn(batch_data)
                loss_window.push(res["critic_loss"])
                actor_loss.push(res["actor_loss"])
                train_count += 1

                if AGENT_CONFIG.get("prioritized_replay", False):
                    agent.update_priorities(indexes=batch_data["indexes"],
                                            td_error=res["td_error"])

            ob = next_ob
            episode_reward += reward
            episode_len += 1
        agent.add_episode(1)
        reward_window.push(episode_reward)
        length_window.push(episode_len)
        if act_count % 200 == 0:
            print("timestep:", act_count, reward_window, loss_window,
                  actor_loss)

    agent.export_saved_model()
    print("Done.", "act_count:", act_count, "train_count:", train_count,
          "total_cost:",
          time.time() - total_cost)
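Every example above prints `WindowStat` objects to track recent rewards, episode lengths and losses, but the class itself belongs to the framework and is never shown. A minimal stand-in, assuming it only needs a fixed-size sliding window with `push()`, `stats()` and a readable repr (the real implementation and its exact `stats()` keys may differ):

import numpy as np
from collections import deque

class WindowStat(object):
    """Sliding-window statistics, e.g. WindowStat("reward", 50)."""

    def __init__(self, name, window_size):
        self.name = name
        self.values = deque(maxlen=window_size)

    def push(self, value):
        self.values.append(value)

    def stats(self):
        # The tests above read stats()["reward_mean"]; the keys here are
        # derived from the window's name.
        return {
            "{}_mean".format(self.name): float(np.mean(self.values)),
            "{}_min".format(self.name): float(np.min(self.values)),
            "{}_max".format(self.name): float(np.max(self.values)),
        }

    def __repr__(self):
        if not self.values:
            return "{}: n/a".format(self.name)
        return "{}_mean: {:.2f} ({} samples)".format(
            self.name, np.mean(self.values), len(self.values))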
Example #17
def main():
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
        print(config)

    env = gym.make("CartPole-v0")
    env.seed(0)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        agent_config=config["agent"],
                        model_config=config["model"],
                        distributed_spec={
                            "ps_hosts": FLAGS.ps_hosts,
                            "worker_hosts": FLAGS.worker_hosts,
                            "job_name": FLAGS.job_name,
                            "task_index": FLAGS.task_index
                        },
                        custom_model=MyPGModel,
                        checkpoint_dir="")

    all_cost = time.time()
    if FLAGS.job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif FLAGS.job_name == "worker":
        print("actor starts===>")
        act_count = 0
        train_count = 0
        reward_window = WindowStat("reward", 50)
        length_window = WindowStat("length", 50)
        obs, actions, rewards, dones, value_preds = [], [], [], [], []

        while not agent.should_stop():
            ob = env.reset()
            done = False
            episode_reward = 0.0
            episode_len = 0.0

            while not done and not agent.should_stop():
                action, results = agent.act([ob], False)

                act_count += 1

                new_ob, reward, done, info = env.step(action[0])

                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                dones.append(done)
                value_preds.append(results["value_preds"][0])

                if agent.ready_to_send:
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          dones=dones,
                                          value_preds=value_preds)
                    batch_data = agent.receive_experience()
                    if batch_data:
                        extra_data = agent.learn(batch_data)
                        train_count += 1
                        print("learning {}".format(extra_data))

                ob = new_ob
                episode_reward += reward
                episode_len += 1
            print("act_count:", act_count)
            reward_window.push(episode_reward)
            length_window.push(episode_len)
            print(reward_window)
            print(length_window)
            sys.stdout.flush()

        if FLAGS.task_index == 0:
            agent.export_saved_model(export_dir="a3c_export_dir")
            print("export savedmodel finish")
    else:
        raise ValueError("Invalid job_name.")
    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
Example #18
def main():
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
        print(config)

    env = gym.make("CartPole-v0")
    env.seed(0)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        agent_config=config["agent"],
                        model_config=config["model"],
                        distributed_spec={
                            "ps_hosts": FLAGS.ps_hosts,
                            "memory_hosts": FLAGS.memory_hosts,
                            "actor_hosts": FLAGS.actor_hosts,
                            "learner_hosts": FLAGS.learner_hosts,
                            "job_name": FLAGS.job_name,
                            "task_index": FLAGS.task_index
                        },
                        custom_model=MyPPOModel,
                        checkpoint_dir=None)
    all_cost = time.time()
    if FLAGS.job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif FLAGS.job_name == "memory":
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            print(agent._receive_count, "actor2mem_q:",
                  agent._actor2mem_q.qsize(), "mem2learner_q:",
                  agent._mem2learner_q.qsize())
            sys.stdout.flush()
            time.sleep(0.1)
    elif FLAGS.job_name == "actor":
        print("actor starts===>")
        act_count = 0
        reward_window = WindowStat("reward", 50)
        length_window = WindowStat("length", 50)
        obs, actions, rewards, dones, value_preds, logits = (
            [], [], [], [], [], [])
        agent.sync_vars()
        while not agent.should_stop():
            ob = env.reset()
            done = False
            episode_reward = .0
            episode_len = 0
            print("begin an episode.")
            sys.stdout.flush()
            while not done and not agent.should_stop():
                action, results = agent.act([ob], False)
                act_count += 1

                new_ob, reward, done, info = env.step(action[0])
                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                dones.append(done)
                logits.append(results["logits"][0])
                value_preds.append(results["value_preds"][0])

                if agent.ready_to_send:
                    print("to send exp.")
                    sys.stdout.flush()
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          dones=dones,
                                          logits=logits,
                                          value_preds=value_preds)
                    agent.sync_vars()
                    print("sent")
                    sys.stdout.flush()
                ob = new_ob
                episode_reward += reward
                episode_len += 1
            print("act_count:", act_count)
            reward_window.push(episode_reward)
            length_window.push(episode_len)
            print(reward_window)
            print(length_window)
            sys.stdout.flush()

    elif FLAGS.job_name == "learner":
        print("learner starts===>")
        train_count = 0
        while not agent.should_stop():
            batch_data = agent.receive_experience()
            if batch_data:
                extra_data = agent.learn(batch_data)
                train_count += 1
                print("learning {}".format(extra_data))
                sys.stdout.flush()
    else:
        raise ValueError("Invalid job_name.")
    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
Example #19
    def doTestSavedModel(self):
        trial_timestamp = time.strftime("%Y%m%d-%H%M%S")
        model_dir = "model_dir_{}".format(trial_timestamp)
        os.system("mkdir {}".format(model_dir))

        np.random.seed(0)
        env = gym.make("CartPole-v0")
        env.seed(0)
        dqn_g = tf.Graph()
        with dqn_g.as_default():
            tf.set_random_seed(123)
            agent = agents[DQN_AGENT_CONFIG["type"]](env.observation_space,
                                                     env.action_space,
                                                     DQN_AGENT_CONFIG,
                                                     DQN_MODEL_CONFIG,
                                                     export_dir=model_dir,
                                                     distributed_spec={})
        reward_window = WindowStat("reward", 50)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        act_count = 0

        for i in range(500):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob],
                                            deterministic=False,
                                            use_perturbed_action=False)

                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                next_obs.append(next_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          next_obs=next_obs,
                                          dones=dones)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                    if DQN_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(indexes=batch_data["indexes"],
                                                td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        prev_perf = reward_window.stats()["reward_mean"]
        print("Performance before saving is {}".format(prev_perf))

        with tf.Session() as sess:
            # Restore the exported SavedModel into a fresh session.
            meta_graph_def = tf.saved_model.loader.load(
                sess, tags=[sm.tag_constants.SERVING], export_dir=model_dir)

            # Look up the "predict_results" SignatureDef protobuf.
            signature_def = meta_graph_def.signature_def["predict_results"]

            # Resolve the input placeholders and output tensors from their
            # TensorInfo protobufs.
            ph_inputs = {}
            for name, ts_info in signature_def.inputs.items():
                ph_inputs[name] = sm.utils.get_tensor_from_tensor_info(
                    ts_info, sess.graph)

            outputs = {}
            for name, ts_info in signature_def.outputs.items():
                outputs[name] = sm.utils.get_tensor_from_tensor_info(
                    ts_info, sess.graph)

            for name, ph in ph_inputs.items():
                print(name, ph)

            for name, ts in outputs.items():
                print(name, ts)

            reward_window = WindowStat("reward", 10)
            for i in range(10):
                ob = env.reset()
                done = False
                episode_reward = .0

                while not done:
                    action = sess.run(outputs["output_actions"],
                                      feed_dict={
                                          ph_inputs["obs_ph"]:
                                          [np.asarray(ob)],
                                          ph_inputs["deterministic_ph"]: True
                                      })
                    next_ob, reward, done, info = env.step(action[0])
                    episode_reward += reward
                    ob = next_ob

                reward_window.push(episode_reward)

        cur_perf = reward_window.stats()["reward_mean"]
        print("Performance after restore is {}".format(cur_perf))
        return prev_perf - cur_perf
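With the signature's placeholders and output tensors resolved as above, serving-time inference is a single `sess.run`. A small helper that wraps the restored `predict_results` signature, built only from the `ph_inputs`/`outputs` dictionaries of the test and assuming the session is still open:

import numpy as np

def make_predict_fn(sess, ph_inputs, outputs):
    """Returns a greedy-action function backed by the restored SavedModel."""

    def predict(ob):
        actions = sess.run(
            outputs["output_actions"],
            feed_dict={
                ph_inputs["obs_ph"]: [np.asarray(ob)],
                ph_inputs["deterministic_ph"]: True,
            })
        return actions[0]

    return predict

# Usage inside the `with tf.Session() as sess:` block above:
#     predict = make_predict_fn(sess, ph_inputs, outputs)
#     action = predict(env.reset())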