def collect_rollouts(
    step, env_ctor, duration, num_agents, agent_config, isolate_envs):
  batch_env = define_batch_env(env_ctor, num_agents, isolate_envs)
  agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config)

  def simulate_fn(unused_last, step):
    # Advance the batched environments by one step, resetting them at step 0.
    done, score, unused_summary = simulate_step(
        batch_env, agent,
        log=False,
        reset=tf.equal(step, 0))
    with tf.control_dependencies([done, score]):
      image = batch_env.observ
      batch_action = batch_env.action
      batch_reward = batch_env.reward
    return done, score, image, batch_action, batch_reward

  initializer = (
      tf.zeros([num_agents], tf.bool),
      tf.zeros([num_agents], tf.float32),
      0 * batch_env.observ,
      0 * batch_env.action,
      tf.zeros([num_agents], tf.float32))
  # Unroll the simulation for `duration` steps; tf.scan returns time-major tensors.
  done, score, image, action, reward = tf.scan(
      simulate_fn, tf.range(duration), initializer, parallel_iterations=1)
  # Keep only the scores of episodes that finished, and convert the remaining
  # tensors from time-major to batch-major layout.
  score = tf.boolean_mask(score, done)
  image = tf.transpose(image, [1, 0, 2, 3, 4])
  action = tf.transpose(action, [1, 0, 2])
  reward = tf.transpose(reward)
  return score, image, action, reward
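# The helper below is an illustrative sketch only (it is not part of the original
# module): assuming TensorFlow 1.x graph mode, it shows how tf.scan produces
# time-major outputs of shape [duration, num_agents] and how a transpose, as in
# collect_rollouts above, turns them into batch-major [num_agents, duration] tensors.
import tensorflow as tf  # Assumed to be the same tf module used above.

def _scan_shape_sketch(duration=5, num_agents=2):
  def step_fn(unused_last, step):
    # Emit one value per agent at each step, standing in for a per-agent reward.
    return tf.fill([num_agents], tf.cast(step, tf.float32))
  rewards = tf.scan(
      step_fn, tf.range(duration),
      initializer=tf.zeros([num_agents], tf.float32),
      parallel_iterations=1)  # time-major: [duration, num_agents]
  return tf.transpose(rewards)  # batch-major: [num_agents, duration]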
def collect_rollouts(
    step, env_ctor, duration, num_agents, agent_config, isolate_envs):
  batch_env = define_batch_env(env_ctor, num_agents, isolate_envs)
  agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config)
  # Defer closing the batched environments to the caller.
  cleanup = lambda: batch_env.close()

  def simulate_fn(unused_last, step):
    done, score, unused_summary = simulate_step(
        batch_env, agent,
        log=False,
        reset=tf.equal(step, 0))
    with tf.control_dependencies([done, score]):
      image = batch_env.observ
      batch_action = batch_env.action
      batch_reward = batch_env.reward
    return done, score, image, batch_action, batch_reward

  initializer = (
      tf.zeros([num_agents], tf.bool),
      tf.zeros([num_agents], tf.float32),
      0 * batch_env.observ,
      0 * batch_env.action,
      tf.zeros([num_agents], tf.float32))
  done, score, image, action, reward = tf.scan(
      simulate_fn, tf.range(duration), initializer, parallel_iterations=1)
  score = tf.boolean_mask(score, done)
  image = tf.transpose(image, [1, 0, 2, 3, 4])
  action = tf.transpose(action, [1, 0, 2])
  reward = tf.transpose(reward)
  # Optional data-collection hook (disabled): accumulate the rollout tensors in
  # global lists and dump the rewards to disk.
  # global score_list, image_list, action_list, reward_list
  # score_list.append(score)
  # image_list.append(image)
  # action_list.append(action)
  # reward_list.append(reward)
  # with open("/home/pulver/Desktop/reward_list.csv", "wb") as f:
  #   writer = csv.writer(f)
  #   writer.writerows(reward)
  return score, image, action, reward, cleanup
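# Illustrative sketch only, not part of the original code: the commented-out hook
# above passes the symbolic `reward` tensor straight to csv.writer, which would fail
# on a graph-mode tensor. A workable pattern (assuming TensorFlow 1.x sessions and a
# caller-chosen output path; the helper name is hypothetical) is to evaluate the
# tensor first and then write the resulting NumPy array:
import csv

def save_rewards_csv(session, reward_tensor, path):
  """Evaluates a [num_agents, duration] reward tensor and writes one CSV row per agent."""
  reward_np = session.run(reward_tensor)
  with open(path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(reward_np.tolist())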
def collect_rollouts(
    step, env_ctor, duration, num_agents, agent_config, isolate_envs):
  batch_env = define_batch_env(env_ctor, num_agents, isolate_envs)
  agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config)
  # Defer closing the batched environments to the caller.
  cleanup = lambda: batch_env.close()

  def simulate_fn(unused_last, step):
    done, score, unused_summary = simulate_step(
        batch_env, agent,
        log=False,
        reset=tf.equal(step, 0))
    with tf.control_dependencies([done, score]):
      image = batch_env.observ
      batch_action = batch_env.action
      batch_reward = batch_env.reward
    return done, score, image, batch_action, batch_reward

  initializer = (
      tf.zeros([num_agents], tf.bool),
      tf.zeros([num_agents], tf.float32),
      0 * batch_env.observ,
      0 * batch_env.action,
      tf.zeros([num_agents], tf.float32))
  done, score, image, action, reward = tf.scan(
      simulate_fn, tf.range(duration), initializer, parallel_iterations=1)
  score = tf.boolean_mask(score, done)
  image = tf.transpose(image, [1, 0, 2, 3, 4])
  action = tf.transpose(action, [1, 0, 2])
  reward = tf.transpose(reward)
  return score, image, action, reward, cleanup
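# Hypothetical usage sketch, not from the original codebase: the environment
# constructor, hyperparameter values, and isolate_envs setting are placeholders.
# Assuming TensorFlow 1.x, it builds the rollout graph once, evaluates it in a
# session, and then calls the returned cleanup() so the environments are closed.
import tensorflow as tf  # Assumed to be the same tf module used above.

def _example_usage(make_env, agent_config):
  score, image, action, reward, cleanup = collect_rollouts(
      step=tf.constant(0),  # Placeholder for the training-step tensor.
      env_ctor=make_env,
      duration=100,
      num_agents=4,
      agent_config=agent_config,
      isolate_envs='none')
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    score_np, reward_np = sess.run([score, reward])
  cleanup()  # Close the batched environments created by collect_rollouts.
  return score_np, reward_np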