Example #1
def make_atari_env():
    # Build Pong and apply DeepMind-style preprocessing: frame stacking,
    # episodic-life termination, unclipped rewards, frames resized to 42 px.
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env=env,
                        frame_stack=True,
                        clip_rewards=False,
                        episode_life=True,
                        wrap_frame=True,
                        frame_resize=42)
    return env
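A minimal usage sketch, not part of the original snippet: it assumes `make_atari_env` above is in scope together with its `make_atari`/`wrap_deepmind` dependencies, and that the wrapped env exposes the standard Gym interface.

# Hypothetical smoke test for the factory above (assumes the gym-style API).
env = make_atari_env()
ob = env.reset()
for _ in range(10):
    ob, reward, done, info = env.step(env.action_space.sample())
    if done:
        ob = env.reset()
env.close()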
Example #2
def main():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env=env,
                        frame_stack=True,
                        clip_rewards=False,
                        episode_life=True,
                        wrap_frame=True,
                        frame_resize=42)

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(env.observation_space,
                        env.action_space,
                        AGENT_CONFIG,
                        MODEL_CONFIG,
                        distributed_spec={},
                        custom_model=MyDQNModel)

    reward_window = WindowStat("reward", 10)
    length_window = WindowStat("length", 10)
    loss_window = WindowStat("loss", 10)
    obs, actions, rewards, next_obs, dones = [], [], [], [], []

    for i in range(2000):
        ob = env.reset()
        ob = np.asarray(ob)
        done = False
        episode_reward = .0
        episode_len = 0

        while not done:
            action, results = agent.act([ob],
                                        deterministic=False,
                                        use_perturbed_action=False)

            next_ob, reward, done, info = env.step(action[0])
            next_ob = np.asarray(next_ob)

            obs.append(ob)
            actions.append(action[0])
            rewards.append(reward)
            next_obs.append(next_ob)
            dones.append(done)
            # hand the collected transitions to the agent once it signals readiness
            if agent.ready_to_send:
                agent.send_experience(obs=obs,
                                      actions=actions,
                                      rewards=rewards,
                                      next_obs=next_obs,
                                      dones=dones)
            # when enough data is buffered, fetch a batch and run a learning step
            if agent.ready_to_receive:
                batch_data = agent.receive_experience()
                res = agent.learn(batch_data)
                loss_window.push(res['loss'])

                if AGENT_CONFIG.get("prioritized_replay", False):
                    agent.update_priorities(indexes=batch_data["indexes"],
                                            td_error=res["td_error"])

            ob = next_ob
            episode_reward += reward
            episode_len += 1
        agent.add_episode(1)
        reward_window.push(episode_reward)
        length_window.push(episode_len)
        if i % 10 == 0:
            print('episode at', i)
            print(reward_window)
            print(length_window)
            print(loss_window)

    print("Done.")
Example #3
def main():
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
        print(config)

    job_name = FLAGS.job_name

    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(
        env=env,
        frame_stack=True,
        clip_rewards=False,
        episode_life=True,
        wrap_frame=True,
        frame_resize=42)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        config["agent"],
        config["model"],
        distributed_spec={
            "ps_hosts": FLAGS.ps_hosts,
            "memory_hosts": FLAGS.memory_hosts,
            "actor_hosts": FLAGS.actor_hosts,
            "learner_hosts": FLAGS.learner_hosts,
            "job_name": FLAGS.job_name,
            "task_index": FLAGS.task_index
        },
        custom_model=MyVTmodel,
        checkpoint_dir=None)
    all_cost = time.time()
    if job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif job_name == "memory":
        start_tt = time.time()
        log_count = 0
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            if time.time() - start_tt > log_count:
                log_count += 1
                print(agent._receive_count, "actor2mem_q:",
                      agent._actor2mem_q.qsize(), "mem2learner_q:",
                      agent._mem2learner_q.qsize())
                sys.stdout.flush()
    elif job_name == "actor":
        print("actor starts===>")
        start_tt = time.time()
        log_count = 0
        act_log_count = 0

        # create vectorized env
        def make_env(rank):
            def make_atari_env():
                env = make_atari("PongNoFrameskip-v4")
                env = wrap_deepmind(
                    env=env,
                    frame_stack=True,
                    clip_rewards=False,
                    episode_life=True,
                    wrap_frame=True,
                    frame_resize=42)
                env.seed(rank)
                return env

            return make_atari_env

        num_env = config["agent"].get("num_env", 1)
        vec_env = VectorizedEnvironment(
            make_env=make_env, num_env=num_env, seed=100 * FLAGS.task_index)

        act_count = 0
        reward_window = WindowStat("reward", 10)
        length_window = WindowStat("length", 10)
        obs, actions, rewards, dones, logits = [], [], [], [], []
        agent.sync_vars()

        while not agent.should_stop():
            ob = vec_env.reset()
            episode_reward = np.zeros(num_env)
            episode_len = np.zeros(num_env)

            while not agent.should_stop():
                action, results = agent.act(ob, False)
                act_count += 1

                new_ob, reward, done, info = vec_env.step(action)

                obs.append(ob)
                actions.append(action)

                rewards.append(reward)
                dones.append(done)
                logits.append(results["logits"])

                if agent.ready_to_send:

                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        logits=logits,
                        vec_env=True,
                        num_env=num_env)
                    agent.sync_vars()

                ob = new_ob

                episode_reward += np.asarray(reward)
                episode_len += 1
                for i in range(num_env):
                    if done[i]:
                        reward_window.push(episode_reward[i])
                        length_window.push(episode_len[i])
                        episode_reward[i] = .0
                        episode_len[i] = 0
                total_cost = time.time() - start_tt
                if int(total_cost / 5) > log_count:
                    log_count += 1
                    print("act_count:", act_count, "actor2mem_q:",
                          agent._actor2mem_q.qsize(), "total:", total_cost)
                    print('total_cost:', total_cost, reward_window)
                    print(length_window)
                    sys.stdout.flush()
                if int((act_count * num_env) / 10000) > act_log_count:
                    act_log_count += 1
                    print('timestep:', act_log_count * 10000, reward_window)

    elif job_name == "learner":
        print("learner starts===>")
        start_tt = time.time()
        train_count = 0
        try:
            while not agent.should_stop():
                batch_data = agent.receive_experience()
                if batch_data:

                    extra_data = agent.learn(batch_data)
                    train_count += 1
                    print("learning {}".format(extra_data), "receive_q:",
                          agent._receive_q.qsize())
                    print("train_count:", train_count, "total:",
                          time.time() - start_tt)
                    sys.stdout.flush()

        except tf.errors.OutOfRangeError as e:
            print("memory has stopped.")
    else:
        raise ValueError("Invalid job_name.")
    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
Example #4
def main():
    gym_env = gym.make("CartPole-v0")

    atari_env = make_atari("PongNoFrameskip-v4")
    atari_env = wrap_deepmind(env=atari_env,
                              frame_stack=True,
                              clip_rewards=False,
                              episode_life=True,
                              wrap_frame=True,
                              frame_resize=42)

    # pick the env that matches the saved_model you exported
    # env = atari_env
    env = gym_env

    with tf.Session() as sess:
        path = 'dump_dir'
        MetaGraphDef = tf.saved_model.loader.load(
            sess, tags=[sm.tag_constants.SERVING], export_dir=path)

        # get SignatureDef protobuf
        SignatureDef_d = MetaGraphDef.signature_def
        SignatureDef = SignatureDef_d["predict_results"]

        # get inputs/outputs TensorInfo protobuf
        ph_inputs = {}
        for name, ts_info in SignatureDef.inputs.items():
            ph_inputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)

        outputs = {}
        for name, ts_info in SignatureDef.outputs.items():
            outputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)

        for name, ph in ph_inputs.items():
            print(name, ph)

        for name, ts in outputs.items():
            print(name, ts)

        len_window = WindowStat("length", 50)
        reward_window = WindowStat("reward", 50)
        for i in range(100):
            ob = env.reset()
            env.render()
            time.sleep(0.2)
            done = False
            episode_len = 0
            episode_reward = .0

            while not done:
                action = sess.run(outputs["output_actions"],
                                  feed_dict={
                                      ph_inputs["obs_ph"]: [np.asarray(ob)],
                                      ph_inputs["deterministic_ph"]: True
                                  })
                next_ob, reward, done, info = env.step(action[0])
                env.render()
                time.sleep(0.1)
                episode_reward += reward
                episode_len += 1
                ob = next_ob

            len_window.push(episode_len)
            reward_window.push(episode_reward)
            print(reward_window)
            print(len_window)
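The inference example omits its import block. A plausible header is sketched below; the `sm` alias in particular is an assumption inferred from the `sm.tag_constants` / `sm.utils` calls above, and `WindowStat`, `make_atari` and `wrap_deepmind` still come from the RL library itself.

# Assumed imports for the SavedModel inference loop above.
import time

import gym
import numpy as np
import tensorflow as tf
from tensorflow import saved_model as sm   # provides tag_constants and utils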