def run_test(agent, num_exp=100):
    # set up environment
    env = envs.make(args.env)

    all_total_reward = []

    # run experiment
    for ep in range(num_exp):
        env.set_random_seed(100000000 + ep)
        env.reset()

        total_reward = 0
        state = env.observe()
        done = False

        while not done:
            act = agent.get_action(state)
            state, reward, done = env.step(act)
            total_reward += reward

        all_total_reward.append(total_reward)

    return all_total_reward

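# Illustrative usage of run_test (not part of the original source). It assumes
# the same globals the function itself relies on (`envs` and `args`) plus a
# trained agent exposing get_action(state), e.g. the ActorAgent used in the
# test scripts below.
#
# sess = tf.Session()
# agent = ActorAgent(sess)
# sess.run(tf.global_variables_initializer())
# rewards = run_test(agent, num_exp=10)
# print('mean total reward:', np.mean(rewards))
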
def __init__(self, **kwargs):
    self.config = kwargs
    # self.env = environments.make('ContinuousThor-v0', goals=['laptop'], scenes=list(range(201, 230)))
    self.env = environments.make('House-v0',
                                 scene='00cfe094634578865b4384f3adef49e6',
                                 goals=['kitchen'])  # , goals=['living_room'])
    self.obs = self.env.reset()

def create_envs(num_training_processes, env_kwargs):
    def wrap(env):
        env = RewardCollector(env)
        env = TransposeImage(env)
        env = ScaledFloatFrame(env)
        env = UnrealEnvBaseWrapper(env)
        return env

    thunk = lambda: wrap(environments.make(**env_kwargs))
    env = SubprocVecEnv([thunk for _ in range(num_training_processes)])
    return env, None

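# Hedged usage sketch for the variant above: env_kwargs is forwarded verbatim to
# environments.make, so the exact keyword naming the environment (shown here as
# `id`) and the kwargs themselves are assumptions, not the project's actual API.
#
# train_env, _ = create_envs(
#     num_training_processes=16,
#     env_kwargs=dict(id='GoalHouse-v1', screen_size=(84, 84)))
# observations = train_env.reset()
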
def create_envs(num_training_processes, tasks, **env_kwargs):
    def wrap(env):
        env = RewardCollector(env)
        env = TransposeImage(env)
        env = ScaledFloatFrame(env)
        env = UnrealEnvBaseWrapper(env)
        return env

    # bind scene and goal as default arguments so each factory keeps its own
    # values (avoids Python's late-binding closure pitfall in the comprehension)
    env_fns = [lambda scene=scene, goal=goal: wrap(environments.make(graph_name=scene, goals=goal, **env_kwargs))
               for (scene, goals) in tasks for goal in goals]
    env = SubprocVecEnv(env_fns)
    env.set_hardness = lambda hardness: env.call_unwrapped('set_complexity', hardness)
    env.set_hardness(0.1)
    # env.set_hardness(1.0)
    return env

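# Hedged sketch of the tasks argument for the variant above: a list of
# (scene, goals) pairs, spawning one subprocess per (scene, goal) combination.
# The scene names and goal tuples below are placeholders, not real experiment
# settings.
#
# tasks = [
#     ('thor-cached-225', [(5, 6, 2)]),
#     ('thor-cached-311', [(1, 2, 3)]),
# ]
# env = create_envs(num_training_processes=len(tasks), tasks=tasks)
# env.set_hardness(0.3)  # curriculum: raise complexity as training progresses
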
def __init__(self, config):
    """ Initializes the class with the configuration. """
    self._config = config
    self._is_chef = config.is_chef

    # create a new environment
    self._env = make("PandaGrasp", config)
    ob_space = self._env.observation_space  # e.g. OrderedDict([('object-state', [10]), ('robot-state', [36])])
    ac_space = self._env.action_space  # e.g. ActionSpace(shape=OrderedDict([('default', 8)]), minimum=-1.0, maximum=1.0)
    print('***', ac_space)

    # get actor and critic networks
    actor, critic = MlpActor, MlpCritic

    # build up networks for the RL agent (SAC or PPO)
    if self._config.algo == 'sac':
        self._agent = SACAgent(config, ob_space, ac_space, actor, critic)
    else:
        self._agent = PPOAgent(config, ob_space, ac_space, actor, critic)

    # build rollout runner
    self._runner = RolloutRunner(config, self._env, self._agent)

    # set up logging
    if self._is_chef and self._config.is_train:
        exclude = ['device']
        if not self._config.wandb:
            os.environ['WANDB_MODE'] = 'dryrun'

        # Weights and Biases (wandb) is used for logging; set the account
        # details below or use the dry run mode above.
        entity = 'panda'  # user or team name
        project = 'robo'  # project name

        wandb.init(resume=config.run_name,
                   project=project,
                   config={k: v for k, v in config.__dict__.items() if k not in exclude},
                   dir=config.log_dir,
                   entity=entity,
                   notes=config.notes)

def main(args):
    BATCH_SIZE = args.batch_size
    MAX_EPSILON = args.max_epsilon
    MIN_EPSILON = args.min_epsilon
    decay = args.decay
    gamma = args.gamma
    env_name = args.env_name

    if env_name in ['MountainCar-v0']:
        env = gym.make(env_name)
        num_states = env.env.observation_space.shape[0]
        num_actions = env.env.action_space.n
    else:
        env = environments.make(env_name)
        num_states = env.get_num_states()
        num_actions = env.get_num_actions()

    model = fully_connected.Model(num_states, num_actions, BATCH_SIZE, layer_sizes=[10, 10])
    mem = helpers.Memory(1000)

    config = tf.ConfigProto(device_count={'GPU': 0})
    saver = tf.train.Saver()
    model_save_dir = os.path.join('.', 'saved_models', datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    os.makedirs(model_save_dir, exist_ok=True)

    with tf.Session(config=config) as sess:
        sess.run(model.var_init)
        gr = helpers.GameRunner(sess, model, env, mem, MAX_EPSILON, MIN_EPSILON, decay, gamma)
        num_episodes = 300
        cnt = 0
        while cnt < num_episodes:
            if cnt % 50 == 0:
                print('Episode {} of {}'.format(cnt + 1, num_episodes))
                gr._render = True
                gr.run()
                save_path = saver.save(sess, os.path.join(model_save_dir, "model_{:05d}.ckpt".format(cnt)))
                print("Model saved in path: %s" % save_path)
            else:
                gr._render = True
                gr.run()
            cnt += 1

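# Hedged follow-up (not part of the original script): a checkpoint written by
# the loop above could later be restored for evaluation with the standard TF1
# Saver API, for example:
#
# with tf.Session(config=config) as sess:
#     saver.restore(sess, tf.train.latest_checkpoint(model_save_dir))
#     gr = helpers.GameRunner(sess, model, env, mem, MAX_EPSILON, MIN_EPSILON, decay, gamma)
#     gr._render = True
#     gr.run()
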
def record_videos(agent, path, screen_size):
    seed = 1
    for scene, tasks in EXPERIMENTS:
        env = environments.make('GoalHouse-v1', screen_size=screen_size, scene=scene, goals=None)
        env = RenderVideoWrapper(env, path)
        env = agent.wrap_env(env)
        env.seed(seed)
        for task in tasks:
            agent.reset_state()
            if task is not None:
                env.unwrapped.set_next_task(task)
            obs = env.reset()
            if task is None:
                print(env.unwrapped.state)
            done = False
            while not done:
                obs, _, done, _ = env.step(agent.act(obs))

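# Illustrative call (not from the source). EXPERIMENTS is expected to be an
# iterable of (scene, tasks) pairs defined elsewhere in the project, and the
# agent must expose wrap_env, reset_state and act; the values below are
# placeholders only.
#
# EXPERIMENTS = [
#     ('0b6d4fe900eaddd80aecf4bc79248dd9', [None]),
# ]
# record_videos(agent, path='./videos', screen_size=(500, 500))
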
import environments

if __name__ == '__main__':
    # from graph.util import load_graph
    # env = environments.make('AuxiliaryGraph-v0', graph_file='/home/jonas/.visual_navigation/scenes/thor-cached-225.pkl')
    # env.unwrapped.browse().show()

    # env = environments.make('CachedThor-v0', goals=[], h5_file_path='test.h5')  # , goals=[], scenes=311
    # env.unwrapped.browse().show()

    env = environments.make('AuxiliaryThor-v1', goals=[], scenes=311, enable_noise=True)
    env.unwrapped.browse().show()

def exceptions(value):
    exceps = [
        'A_VisitsDecay_3c6267913c894118a9e60bc796652cc7',
        'A_UCB_d9546ca2809d43e4b8b5be4a5d5c33ac',
        'A_UCB_341235ec2ac7476981f9be3281506c7a',
        'A_DecayRate_2923aed748004df4819f142b64433e8e',
        'A_AlwaysGreedy_3cab8bb2ecd94981bd57c46af642e9f8'
    ]
    if value in exceps:
        return False
    else:
        return True


OTHERS = list(filter(exceptions, [a for a in list_saved_agents(filter='unique')]))
AGENTS = [a for a in TOP]

env = environments.make('hitstand')


def get_features(route):
    def translate_null(value, lr=False):
        if value:
            return value
        else:
            if lr:
                return '1/visits'
            else:
                return 'N/A'

    def rename(feats_dict):

def training_agent(agent_id, params_queue, reward_queue, adv_queue, gradient_queue):
    np.random.seed(args.seed)     # for environment
    tf.set_random_seed(agent_id)  # for model evolving

    sess = tf.Session()

    # set up actor and critic agents for training
    actor_agent = ActorAgent(sess)
    critic_agent = CriticAgent(sess, input_dim=args.num_workers + 2)

    # set up environment
    env = envs.make(args.env)

    # collect experiences
    while True:
        # get parameters from master
        (actor_params, critic_params, entropy_weight, model_idx) = params_queue.get()

        # synchronize model parameters
        actor_agent.set_params(actor_params)
        critic_agent.set_params(critic_params)

        # reset environment
        env.set_random_seed(model_idx)
        env.reset()

        # set up training storage
        batch_inputs, batch_act_vec, batch_wall_time, batch_reward = [], [], [], []

        # run experiment
        state = env.observe()
        done = False

        while not done:
            # decompose state (for storing information)
            workers, job, curr_time = state

            inputs = np.zeros([1, args.num_workers + 1])
            for worker in workers:
                inputs[0, worker.worker_id] = min(
                    sum(j.size for j in worker.queue) / args.job_size_norm_factor / 5.0,  # normalization
                    20.0)
            inputs[0, -1] = min(job.size / args.job_size_norm_factor, 10.0)  # normalization

            # draw an action
            action = actor_agent.predict(inputs)[0]

            # store input and action
            batch_inputs.append(inputs)
            act_vec = np.zeros([1, args.num_workers])
            act_vec[0, action] = 1
            batch_act_vec.append(act_vec)

            # store wall time
            batch_wall_time.append(curr_time)

            # interact with environment
            state, reward, done = env.step(action)

            # scale reward for training
            reward /= args.reward_scale

            # store reward
            batch_reward.append(reward)

        # store final time
        batch_wall_time.append(env.wall_time.curr_time)

        # compute all values
        value_inputs = np.zeros([len(batch_inputs), args.num_workers + 2])
        for i in range(len(batch_inputs)):
            value_inputs[i, :-1] = batch_inputs[i]
            value_inputs[i, -1] = batch_wall_time[i] / float(batch_wall_time[-1])
        batch_values = critic_agent.predict(value_inputs)

        # summarize more info for master agent
        unfinished_jobs = sum(len(worker.queue) for worker in env.workers)
        unfinished_jobs += sum(worker.curr_job is not None for worker in env.workers)

        finished_work = sum(j.size for j in env.finished_jobs)

        unfinished_work = 0
        for worker in env.workers:
            for j in worker.queue:
                unfinished_work += j.size
            if worker.curr_job is not None:
                unfinished_work += worker.curr_job.size

        average_job_duration = np.mean(
            [j.finish_time - j.arrival_time for j in env.finished_jobs])

        # report rewards to master agent
        reward_queue.put([
            batch_reward, np.array(batch_values), batch_wall_time,
            len(env.finished_jobs), unfinished_jobs,
            finished_work, unfinished_work,
            average_job_duration])

        # get advantage term
        batch_adv, batch_actual_value = adv_queue.get()

        # compute gradients
        actor_gradient, loss = actor_agent.compute_gradients(
            batch_inputs, batch_act_vec, batch_adv, entropy_weight)
        critic_gradient, _ = critic_agent.compute_gradients(
            value_inputs, batch_actual_value)

        # send back gradients
        gradient_queue.put([actor_gradient, critic_gradient, loss])

    sess.close()

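# Hedged sketch of how a master process might launch these workers; the actual
# master loop is not shown in this excerpt, so the multiprocessing wiring and
# the `num_agents` count below are assumptions consistent with the four queue
# arguments above.
#
# import multiprocessing as mp
# num_agents = 8  # placeholder
# queues = [[mp.Queue() for _ in range(4)] for _ in range(num_agents)]
# workers = [mp.Process(target=training_agent, args=(i, *queues[i]))
#            for i in range(num_agents)]
# for w in workers:
#     w.start()
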
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create result and model folder
    create_folder_if_not_exists(args.result_folder)

    # different agents for different environments
    if args.env == 'load_balance':
        schemes = ['shortest_processing_time', 'learn']
    else:
        schemes = ['learn']

    # tensorflow session
    sess = tf.Session()

    # store results
    all_performance = {scheme: [] for scheme in schemes}

    # create environment
    env = envs.make(args.env)

    # initialize all agents
    agents = {}
    for scheme in schemes:
        if scheme == 'learn':
            agents[scheme] = ActorAgent(sess)
            # saver for loading trained model
            saver = tf.train.Saver(max_to_keep=args.num_saved_models)
            # initialize parameters
            sess.run(tf.global_variables_initializer())
            # load trained model
            if args.saved_model is not None:
                saver.restore(sess, args.saved_model)
        elif scheme == 'least_work':
            agents[scheme] = LeastWorkAgent()
        elif scheme == 'shortest_processing_time':
            agents[scheme] = ShortestProcessingTimeAgent()
        else:
            print('invalid scheme', scheme)
            exit(1)

    # store results
    all_performance = {}

    # plot total reward CDF
    fig = plt.figure()
    title = 'average: '

    for scheme in schemes:
        all_total_reward = run_test(agents[scheme], num_exp=args.num_ep)
        all_performance[scheme] = all_total_reward
        x, y = compute_CDF(all_total_reward)
        plt.plot(x, y)
        title += ' ' + scheme + ' '
        title += '%.2f' % np.mean(all_total_reward)

    plt.xlabel('Total reward')
    plt.ylabel('CDF')
    plt.title(title)
    plt.legend(schemes)
    fig.savefig(args.result_folder + args.env + '_all_performance.png')
    plt.close(fig)

    # save all total rewards
    np.save(args.result_folder + args.env + '_all_performance.npy', all_performance)

    sess.close()

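# `compute_CDF` is called above (and in the script below) but is not defined in
# this excerpt. The helper sketched here only illustrates what it likely
# computes (an empirical CDF of the total rewards); the project's actual
# implementation may differ. It assumes numpy is imported as np.
def compute_CDF_sketch(values):
    x = np.sort(np.asarray(values, dtype=float))    # sorted sample values
    y = np.arange(1, len(x) + 1) / float(len(x))    # cumulative probabilities
    return x, y
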
argparse.ArgumentParser(description="") parser = argparse.ArgumentParser( description='Deep reactive agent scene explorer.') parser.add_argument('--h5_file_path', type=str, default='/app/data/{scene}.h5') parser.add_argument('--unity_path', type=str) parser.add_argument('--scene', help='Scene to run the explorer on', default='bedroom_04', type=str) args = vars(parser.parse_args()) from experiments.data import TRAIN, VALIDATION env = environments.make( 'GoalHouse-v1', screen_size=(500, 500), scene=['0b6d4fe900eaddd80aecf4bc79248dd9'] ) #['b814705bc93d428507a516b866efda28','e3ae3f7b32cf99b29d3c8681ec3be321','5f3f959c7b3e6f091898caa8e828f110']) #from environments.gym_house.video import RenderVideoWrapper #env = RenderVideoWrapper(env, '') ''' 208, 212 ''' #env = environments.make('AuxiliaryGraph-v0', goals = (5, 6, 2), graph_name = 'thor-cached-225') # graph_file = 'kitchen.pkl') GoalKeyboardAgent(env).show()
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create result and model folder
    create_folder_if_not_exists(args.result_folder)

    # different agents for different environments
    if args.env == 'load_balance':
        schemes = ['shortest_processing_time', 'learn']
    else:
        print('Schemes for ' + args.env + ' do not exist')
        exit(1)

    # tensorflow session
    sess = tf.Session()

    # store results
    all_performance = {scheme: [] for scheme in schemes}

    # create environment
    env = envs.make(args.env)

    # initialize all agents
    agents = {}
    for scheme in schemes:
        if scheme == 'learn':
            agents[scheme] = ActorAgent(sess)
            # saver for loading trained model
            saver = tf.train.Saver(max_to_keep=args.num_saved_models)
            # initialize parameters
            sess.run(tf.global_variables_initializer())
            # load trained model
            if args.saved_model is not None:
                saver.restore(sess, args.saved_model)
        elif scheme == 'least_work':
            agents[scheme] = LeastWorkAgent()
        elif scheme == 'shortest_processing_time':
            agents[scheme] = ShortestProcessingTimeAgent()
        else:
            print('invalid scheme', scheme)
            exit(1)

    # run testing experiments
    for ep in range(args.num_ep):
        for scheme in schemes:
            # reset the environment with a controlled seed
            env.set_random_seed(ep)
            env.reset()

            # pick agent
            agent = agents[scheme]

            # store total reward
            total_reward = 0

            # -- run the environment --
            t1 = time.time()
            state = env.observe()
            done = False
            while not done:
                action = agent.get_action(state)
                state, reward, done = env.step(action)
                total_reward += reward
            t2 = time.time()
            print('Elapsed', scheme, t2 - t1, 'seconds')

            all_performance[scheme].append(total_reward)

    # plot total reward CDF
    fig = plt.figure()
    title = 'average: '

    for scheme in schemes:
        x, y = compute_CDF(all_performance[scheme])
        plt.plot(x, y)
        title += ' ' + scheme + ' '
        title += '%.2f' % np.mean(all_performance[scheme])

    plt.xlabel('Total reward')
    plt.ylabel('CDF')
    plt.title(title)
    plt.legend(schemes)
    fig.savefig(args.result_folder + args.env + '_all_performance.png')
    plt.close(fig)

    # save all total rewards
    np.save(args.result_folder + args.env + '_all_performance.npy', all_performance)

    sess.close()