def run(self):
    assert callable(self.env_maker)
    env = self.env_maker()

    # setup policy
    self.policy_type = self.policy_type.lower()
    if self.policy_type == 'stochastic':
        if discrete_action(env.action_space):
            self.policy = SoftmaxPolicy()
        elif continuous_action(env.action_space):
            self.policy = GaussianPolicy(low=env.action_space.low,
                                         high=env.action_space.high)
        else:
            raise TypeError('Type of action_space not valid')
    elif self.policy_type == 'greedy':
        if not discrete_action(env.action_space):
            raise TypeError('greedy policy supports only discrete action.')
        self.policy = EpsGreedyPolicy(self.policy_eps)
    else:
        raise ValueError('policy type {} invalid.'.format(self.policy_type))

    # load model
    saved_model = self.do_load_model()
    net = self.net_cls()
    net.set_model(saved_model)

    # global_variables_initializer will re-initialize net.weights
    # and so we need to sync to saved_weights
    saved_weights = saved_model.get_weights()
    sess = tf.Session()
    net.set_session(sess)
    sess.run(tf.global_variables_initializer())
    net.set_sync_weights(saved_weights)
    net.sync()

    # evaluation
    all_total_rewards = []
    for _ in range(self.num_episodes):
        state = env.reset()
        self.render_env_at_timestep(env)
        total_rewards = 0.0
        while True:
            state = self.state_to_input(state)
            action_values = net.action_values([state])[0]
            action = self.policy.select_action(action_values)
            print('action:', action)
            state, reward, done, info = env.step(action)
            self.render_env_at_timestep(env)
            total_rewards += reward
            if done:
                break
        if self.render_end:
            env.render()
        all_total_rewards.append(total_rewards)
        self.print('episode reward: {}'.format(total_rewards))
    average_reward = sum(all_total_rewards) / len(all_total_rewards)
    self.print('average episode reward: {}'.format(average_reward))
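# A minimal usage sketch (not part of the original source): how an object with
# the attributes read by run() might be driven. The class name `Evaluator` and
# the constructor keywords are assumptions for illustration; run() itself only
# requires env_maker, policy_type, policy_eps, net_cls, num_episodes, and
# render_end, plus the do_load_model/state_to_input/render_env_at_timestep
# helpers defined elsewhere in the class.
#
#     evaluator = Evaluator(env_maker=lambda: gym.make('CartPole-v0'),
#                           policy_type='stochastic',
#                           num_episodes=10,
#                           render_end=False)
#     evaluator.run()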
def worker(self, wid):
    """Run a worker process."""
    assert callable(self.env_maker)
    env = self.env_maker()

    # determine action mode from env.action_space
    if discrete_action(env.action_space):
        self.action_mode = 'discrete'
        self.action_dim = env.action_space.n
    elif continuous_action(env.action_space):
        self.action_mode = 'continuous'
        self.action_dim = len(env.action_space.shape)
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
    else:
        raise TypeError('Invalid type of env.action_space')

    # only the master worker (wid 0) writes output
    self.is_master = wid == 0
    if self.is_master and self.save_dir is not None:
        env_name = 'UnknownEnv-v0' if env.spec is None else env.spec.id
        self.output = self.get_output_dir(env_name)
    else:
        self.output = None

    # ports, cluster, and server
    cluster_list = ['{}:{}'.format(LOCALHOST, p) for p in self.port_list]
    cluster = tf.train.ClusterSpec({JOBNAME: cluster_list})
    tf.train.Server(cluster, job_name=JOBNAME, task_index=wid)
    self.print('Starting server #{}'.format(wid))
    self.setup_algorithm()

    # global/local devices
    worker_dev = '/job:{}/task:{}/cpu:0'.format(JOBNAME, wid)
    rep_dev = tf.train.replica_device_setter(worker_device=worker_dev,
                                             cluster=cluster)
    self.setup_nets(worker_dev, rep_dev, env)

    # optional replay memory
    if self.replay_type is not None:
        replay_kwargs = {**REPLAY_KWARGS, **self.replay_kwargs}
        if self.is_master:
            self.print_kwargs(replay_kwargs, 'Replay memory arguments')
        if self.replay_type == 'uniform':
            self.replay = Replay(**replay_kwargs)
        elif self.replay_type == 'prioritized':
            self.replay = PriorityReplay(**replay_kwargs)
        else:
            message = 'replay type {} invalid'.format(self.replay_type)
            raise ValueError(message)

    # begin tensorflow session, build async RL agent and train
    port = self.port_list[wid]
    with tf.Session('grpc://{}:{}'.format(LOCALHOST, port)) as sess:
        sess.run(tf.global_variables_initializer())
        self.set_session(sess)

        # train the agent
        self.train_on_env(env)

    # signal completion; presumably the launching process watches
    # event_finished and tears everything down, while the master keeps
    # its in-process TF server alive in the meantime
    if self.num_parallel > 1:
        self.event_finished.set()
        if self.is_master:
            while True:
                time.sleep(1)
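# A minimal launch sketch (an assumption, not from the original source):
# worker() is written to be run once per task index, e.g. from the parent
# process via multiprocessing, with port_list/num_parallel/event_finished
# shared across processes. The names `trainer` and `mp` are illustrative only.
#
#     workers = [mp.Process(target=trainer.worker, args=(wid,))
#                for wid in range(trainer.num_parallel)]
#     for p in workers:
#         p.start()
#     trainer.event_finished.wait()   # any worker signals completion
#     for p in workers:
#         p.terminate()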