Example #1
# TF1 graph-mode code; `define_batch_env`, `simulate_step`, and `mpc_agent`
# are provided by the surrounding module.
import tensorflow as tf


def collect_rollouts(step, env_ctor, duration, num_agents, agent_config,
                     isolate_envs):
    """Simulates `num_agents` environments for `duration` steps.

    Returns the stacked rollout tensors together with a `cleanup` callable
    that closes the batch environment once the rollouts have been fetched.
    """
    batch_env = define_batch_env(env_ctor, num_agents, isolate_envs)
    agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config)
    cleanup = lambda: batch_env.close()

    def simulate_fn(unused_last, step):
        # Advance every environment by one step, resetting on the first step.
        done, score, unused_summary = simulate_step(batch_env,
                                                    agent,
                                                    log=False,
                                                    reset=tf.equal(step, 0))
        # Only read the environment state after the step has executed.
        with tf.control_dependencies([done, score]):
            image = batch_env.observ
            batch_action = batch_env.action
            batch_reward = batch_env.reward
        return done, score, image, batch_action, batch_reward

    # The initializer fixes the dtype and shape of every per-step output.
    initializer = (tf.zeros([num_agents], tf.bool),
                   tf.zeros([num_agents], tf.float32), 0 * batch_env.observ,
                   0 * batch_env.action, tf.zeros([num_agents], tf.float32))
    # parallel_iterations=1 keeps the simulation strictly sequential in time.
    done, score, image, action, reward = tf.scan(simulate_fn,
                                                 tf.range(duration),
                                                 initializer,
                                                 parallel_iterations=1)
    # Keep only the scores of steps on which an episode actually ended.
    score = tf.boolean_mask(score, done)
    # Reorder from [duration, num_agents, ...] to [num_agents, duration, ...].
    image = tf.transpose(image, [1, 0, 2, 3, 4])
    action = tf.transpose(action, [1, 0, 2])
    reward = tf.transpose(reward)
    return score, image, action, reward, cleanup
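
Since this is TF1 graph-mode code, calling `collect_rollouts` only builds ops; the returned tensors must be fetched in a session, and `cleanup` called afterwards. A minimal usage sketch, assuming the surrounding codebase supplies `env_ctor` and `agent_config` (the `duration` and `num_agents` values here are illustrative):

score, image, action, reward, cleanup = collect_rollouts(
    step=tf.train.get_or_create_global_step(), env_ctor=env_ctor,
    duration=100, num_agents=4, agent_config=agent_config,
    isolate_envs=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    score_np, reward_np = sess.run([score, reward])
cleanup()  # shut down the batch-environment workers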
Example #2
def collect_rollouts(step, env_ctor, duration, num_agents, agent_config,
                     isolate_envs):
    batch_env = define_batch_env(env_ctor, num_agents, isolate_envs)
    agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config)
    cleanup = lambda: batch_env.close()

    def simulate_fn(unused_last, step):
        done, score, unused_summary = simulate_step(batch_env,
                                                    agent,
                                                    log=False,
                                                    reset=tf.equal(step, 0))
        with tf.control_dependencies([done, score]):
            image = batch_env.observ
            batch_action = batch_env.action
            batch_reward = batch_env.reward
        return done, score, image, batch_action, batch_reward

    initializer = (tf.zeros([num_agents], tf.bool),
                   tf.zeros([num_agents], tf.float32), 0 * batch_env.observ,
                   0 * batch_env.action, tf.zeros([num_agents], tf.float32))
    done, score, image, action, reward = tf.scan(simulate_fn,
                                                 tf.range(duration),
                                                 initializer,
                                                 parallel_iterations=1)
    score = tf.boolean_mask(score, done)
    image = tf.transpose(image, [1, 0, 2, 3, 4])
    action = tf.transpose(action, [1, 0, 2])
    reward = tf.transpose(reward)

    # Disabled data-collection hook: accumulate the rollout tensors in global
    # lists and dump the rewards to disk. Note that this block would run at
    # graph-construction time, when `reward` is still a symbolic tensor, so
    # the CSV dump would not see numeric values (see the sketch after this
    # function).
    # global score_list, image_list, action_list, reward_list
    # score_list.append(score)
    # image_list.append(image)
    # action_list.append(action)
    # reward_list.append(reward)
    # print("===========")
    # print("Reward shape: ", reward.shape)
    # print("Reward: ", reward)
    # print("Reward[1]: ", reward[1])
    # # with open('reward_list', 'wb') as fp:
    # #     pickle.dump(reward_list, fp)  # requires `import pickle`
    # with open("/home/pulver/Desktop/reward_list.csv", "w", newline="") as f:
    #     writer = csv.writer(f)  # requires `import csv`
    #     writer.writerows(reward)

    return score, image, action, reward, cleanup
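
The disabled block above runs at graph-construction time, when `reward` is still a symbolic tensor, so appending it to Python lists or writing it out would not record numeric values. One way to make the dump actually execute is `tf.py_func`, which calls a Python function with concrete numpy arrays each time the graph runs. A minimal sketch to place just before the `return`; `_dump_rewards` and the output path are hypothetical:

import csv

def _dump_rewards(reward_np):
    # Runs in Python at graph-execution time, with a concrete numpy array.
    with open("/tmp/reward_list.csv", "a", newline="") as f:
        csv.writer(f).writerows(reward_np.tolist())
    return reward_np

# Route `reward` through the dump so fetching it also appends to the CSV.
reward = tf.py_func(_dump_rewards, [reward], tf.float32)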
Example #3
def collect_rollouts(step, env_ctor, duration, num_agents, agent_config,
                     isolate_envs):
    batch_env = define_batch_env(env_ctor, num_agents, isolate_envs)
    agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config)
    cleanup = lambda: batch_env.close()

    # print('----INSIDE COLL ROLL-----')

    def simulate_fn(unused_last, step):
        done, score, unused_summary = simulate_step(batch_env,
                                                    agent,
                                                    log=False,
                                                    reset=tf.equal(step, 0))
        with tf.control_dependencies([done, score]):
            image = batch_env.observ
            batch_action = batch_env.action
            batch_reward = batch_env.reward
        return done, score, image, batch_action, batch_reward

    initializer = (tf.zeros([num_agents], tf.bool),
                   tf.zeros([num_agents], tf.float32), 0 * batch_env.observ,
                   0 * batch_env.action, tf.zeros([num_agents], tf.float32))
    done, score, image, action, reward = tf.scan(simulate_fn,
                                                 tf.range(duration),
                                                 initializer,
                                                 parallel_iterations=1)
    # Disabled build-time shape checks; `.shape` on these tensors is the
    # static shape, available before the graph runs.
    # print(batch_env._get_filename())
    # print('--IMGSHAPE', image.shape)
    # print('--REWSHAPE', reward.shape)
    # print('--DONESHAPE', done.shape)
    # print('--ACSHAPE', action.shape)
    # print('--SCRSHAPE', score.shape)
    # assert 1 == 2  # deliberate halt for inspecting the printed shapes
    score = tf.boolean_mask(score, done)
    image = tf.transpose(image, [1, 0, 2, 3, 4])
    action = tf.transpose(action, [1, 0, 2])
    reward = tf.transpose(reward)
    # print('----EXIT COLL ROLL-----')
    return score, image, action, reward, cleanup
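
The disabled prints above only show static shapes, since the tensors are symbolic at graph-construction time. For values at run time, a TF1-style option is `tf.Print`, which passes its first argument through unchanged and logs the listed tensors whenever it executes; a minimal sketch (the message string is illustrative):

reward = tf.Print(reward, [tf.shape(reward), reward[0]],
                  message='reward shape / first agent: ', summarize=10)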