def get_actor_model(id, args, act_shapes, obs_shapes):
    logger.info("create actor nets for agent: %d" % id)
    input_size = obs_shapes[id]
    output_size = act_shapes[id]
    model = MLP(args.num_units, input_size, output_size)
    if args.print_net:
        model.summary()
    return model
def get_critic_model(id, args, act_shapes, obs_shapes):
    logger.info("create critic nets for agent: %d" % id)
    # centralized critic: conditions on all agents' observations and actions
    input_size = sum(obs_shapes) + sum(act_shapes)
    output_size = 1
    model = MLP(args.num_units, input_size, output_size)
    if args.print_net:
        model.summary()
    return model
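# The MLP class used above is defined elsewhere in this repo. For reference,
# a minimal sketch of a compatible model (an assumption, not the actual
# implementation): two hidden layers of num_units with ReLU and a linear
# output, built with plain tf.keras.
def _mlp_sketch(num_units, input_size, output_size):
    import tensorflow as tf
    return tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(input_size,)),
        tf.keras.layers.Dense(num_units, activation='relu'),
        tf.keras.layers.Dense(num_units, activation='relu'),
        tf.keras.layers.Dense(output_size),
    ])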
def learn(args):
    env = make_env(args=args, id=0)
    agent = make_learner_agent(args, env.n, env.action_space,
                               env.observation_space)
    env = None  # the learner only needs the spaces; drop the env reference
    serve(agent)
    agent.upload_minio()
    logger.info("Finished, tensorboard --logdir=%s" % agent.tb_dir)
def get_shapes(in_space):
    logger.info(str(in_space))
    from gym import spaces
    if isinstance(in_space[0], spaces.Box):
        return [space.shape[0] for space in in_space]
    if isinstance(in_space[0], spaces.Discrete):
        return [space.n for space in in_space]
    raise NotImplementedError
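# Usage note for get_shapes (hypothetical spaces, for illustration only):
#   get_shapes([spaces.Box(-1.0, 1.0, shape=(8,))] * 2) -> [8, 8]
#   get_shapes([spaces.Discrete(5)] * 3) -> [5, 5, 5]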
def upload_minio(self):
    logger.info("upload model into minio")
    # upload tensorboard
    dest_obj_name = "exps/tensorboard/%s/%s.tar.gz" % (
        self.args.runner, self.args.run_id)
    self.stoarge.tar_and_fput(self.tb_dir, dest_obj_name)
    # upload model
    dest_obj_name = "exps/model/%s/%s.tar.gz" % (
        self.args.runner, self.args.run_id)
    self.stoarge.tar_and_fput(self.model_dir, dest_obj_name)
def explore_and_learn(args):
    logger.info("CPU count:{}".format(cpu_count()))
    processes = []
    # learn
    p = Process(target=learn, args=(args,))
    p.start()
    processes.append(p)
    # explore
    p = Process(target=parallel_explore, args=(args,))
    p.start()
    processes.append(p)
    for p in processes:
        p.join()
def action(self, obs):
    self.step += len(obs)
    if self.step % self.decay_step == 0:
        self.sigma = max(self.sigma * self.decay_rate, self.min_sigma)
        logger.info("sigma decay to: %.3f, at %d" % (self.sigma, self.step))
    batch_obs = tf.convert_to_tensor(np.asarray(obs), dtype=tf.float32)
    acts = []
    for i in range(self.n):
        act = self.actors[i](batch_obs[:, i, :])
        # Gaussian exploration noise, then clip back to the action range
        noised_act = act + tf.random.normal(
            shape=act.shape, stddev=self.sigma, dtype=tf.float32)
        # TODO(liuwen): clip according to act_space instead of a fixed [-1, 1]
        acts.append(tf.clip_by_value(noised_act, -1.0, 1.0))
    acts_tf = tf.stack(acts, axis=1)
    return acts_tf
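# Closed-form view of the noise schedule in action() above (values here are
# hypothetical, not repo defaults). Note this is an idealization: the in-place
# update only fires when self.step lands exactly on a multiple of decay_step,
# which batched increments of len(obs) can skip.
def _sigma_at(step, sigma0=0.3, decay_step=10000, decay_rate=0.99,
              min_sigma=0.01):
    return max(sigma0 * decay_rate ** (step // decay_step), min_sigma)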
def explore(args, id):
    c = zmq.Context()
    s = c.socket(zmq.REQ)
    host = 'tcp://%s:%d' % (args.host, args.port)
    s.connect(host)
    logger.info('zmq socket addr: %s' % host)
    batch_env = BatchedEnvironment(args, id)
    obs = batch_env.reset()
    action = batch_env.uniform_action()
    i = 0
    n = args.env_batch_size
    episode = [0] * n
    episode_step = [0] * n
    while True:
        next_obs, rew, done, info = batch_env.step(action)
        i += n
        increment(episode_step, n)
        terminal = [episode_step[k] >= args.max_episode_len for k in range(n)]
        sample = [obs, action, next_obs, rew, done, terminal]
        p = pickle.dumps(sample)
        z = zlib.compress(p)
        while True:
            try:
                s.send_pyobj(z)
                data = s.recv_pyobj()
                action = pickle.loads(data)
                break
            except zmq.ZMQError:
                logger.error("send to zmq server[%s] error, sleep 1s" % host)
                time.sleep(1)
        if str(action) == "stop":
            logger.info("[%d],%d finished explore, learning server stopped"
                        % (id, i))
            break
        if i % (10 * args.save_rate) == 0:
            logger.debug("batch_env[%d] step:%d, episode:%s"
                         % (id, i, str(episode)))
        obs = batch_env.reset_if_done(done, terminal, episode_step, episode)
        if i % 10000 == 0:
            logger.debug(str(id) + ":" + str(episode))
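# The explorer<->learner wire format, for reference: a sample is pickled,
# zlib-compressed, then sent via send_pyobj (which pickles the compressed
# bytes once more). A minimal round-trip check of that encoding (hypothetical
# helper, not part of the repo):
def _roundtrip(sample):
    z = zlib.compress(pickle.dumps(sample))   # what explore() sends
    return pickle.loads(zlib.decompress(z))   # what serve() reconstructs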
def __init__(self, args, agent_num, act_spaces, obs_spaces):
    super(Agent, self).__init__(args, agent_num, act_spaces, obs_spaces)
    logger.info("actors: act_shapes:%s, obs_shapes:%s"
                % (str(self.act_shapes), str(self.obs_shapes)))
    self.actors = self.create_actors()
    self.target_actors = self.create_actors()
    logger.info("critics: act_shapes:%s, obs_shapes:%s"
                % (str(self.act_shapes), str(self.obs_shapes)))
    self.critics = self.create_critics()
    self.target_critics = self.create_critics()
    self.sigma = args.sigma
    self.decay_step = args.decay_step
    self.decay_rate = args.decay_rate
    self.min_sigma = args.min_sigma
    self.actor_optimizers = [
        tf.keras.optimizers.Adam(learning_rate=args.plr, name='Adam')
        for _ in range(self.n)
    ]
    self.critic_optimizers = [
        tf.keras.optimizers.Adam(learning_rate=args.qlr, name='Adam')
        for _ in range(self.n)
    ]
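# The target networks created above are typically kept in sync with a soft
# (Polyak) update. A sketch under that assumption; the actual update lives in
# Agent.learn(), which is not shown here, and tau is a hypothetical value.
def _soft_update(target_model, source_model, tau=0.01):
    for t, w in zip(target_model.trainable_variables,
                    source_model.trainable_variables):
        t.assign(tau * w + (1.0 - tau) * t)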
def serve(agent):
    logger.info("serve")
    c = zmq.Context()
    s = c.socket(zmq.REP)
    s.bind('tcp://127.0.0.1:%d' % agent.args.port)
    logger.info("zmq bind at tcp://127.0.0.1:%d" % agent.args.port)
    explore_size = agent.args.explore_size
    env_batch_size = agent.args.env_batch_size
    i, train_iter, episode, stop_client_num, record_i = 0, 0, 0, 0, 0
    episode_rews = [0] * agent.args.save_rate
    mean_reward = 0.0
    start = time.time()
    batch_start = time.time()
    log_start = time.time()
    with agent.writer.as_default():
        while True:
            z = s.recv_pyobj()
            p = zlib.decompress(z)
            data = pickle.loads(p)
            [obs, action, next_obs, rew, done, terminal] = data
            for j in range(env_batch_size):
                agent.buffer.add(obs[j], action[j], rew[j], next_obs[j],
                                 done[j])
            i += env_batch_size
            if i % explore_size == 0 and episode <= agent.args.warm_up:
                t = time.time()
                if episode < agent.args.save_rate:
                    mean_reward = 0.0
                else:
                    mean_reward = np.mean(episode_rews)
                logger.info(
                    get_explore_log(i, agent.args.warm_up, episode,
                                    mean_reward, t - batch_start, t - start))
                batch_start = t
            for j in range(env_batch_size):
                if all(done[j]) or terminal[j]:
                    episode += 1
                    loc = episode % agent.args.save_rate
                    episode_rews[loc] = np.sum(rew[j])
                    if episode % agent.args.save_rate == 0:
                        record_i += 1
                        mean_reward = np.mean(episode_rews)
                        if mean_reward > agent.best_score:
                            agent.best_score = mean_reward
                            agent.save()
                        tf.summary.scalar('1.performance/2.episode_reward',
                                          mean_reward, record_i)
                        if episode > agent.args.warm_up:
                            batch_end = time.time()
                            log_msg = get_train_log(
                                i, episode, agent.args.num_episodes,
                                mean_reward, batch_end - log_start,
                                batch_end - start)
                            log_start = batch_end
                            logger.info(log_msg)
            if i % explore_size == 0 and episode > agent.args.warm_up:
                train_iter += 1
                explore_time = time.time() - batch_start
                logger.debug(
                    "serve collect %d explore samples spend %.3f secs"
                    % (explore_size, explore_time))
                tf.summary.scalar('3.time/2.explore', explore_time, train_iter)
                agent.learn(train_iter)
                batch_start = time.time()
            action = agent.action(next_obs)
            p = pickle.dumps(action)
            if episode >= agent.args.num_episodes:
                stop_client_num += 1
                logger.info("i=%d, episode=%d" % (i, episode))
                s.send_pyobj(pickle.dumps("stop"))
                if stop_client_num >= agent.args.num_env:
                    agent.writer.close()
                    break
            else:
                s.send_pyobj(p)
    s.close()
def make_learner_agent(args=None, n=3, act_spaces=None, obs_spaces=None):
    logger.info("act_spaces:" + str(act_spaces))
    logger.info("obs_spaces:" + str(obs_spaces))
    agent = Agent(args, n, act_spaces, obs_spaces)
    return agent
if __name__ == '__main__':
    args = parse_experiment_args()
    if args.debug:
        import logging
        logger.setLevel(logging.DEBUG)
    if args.role == EXPLORER:
        parallel_explore(args)
    if args.role == LEARNER:
        logger.info("parameters start" + "*" * 100)
        logger.info(str(args))
        logger.info("parameters end " + "*" * 100)
        logger.info("set global_seeds: %s" % str(args.seed))
        set_global_seeds(args.seed)
        explore_and_learn(args)
        # learn(args)