Example #1
File: actor.py Project: iminders/maddpg
def get_actor_model(id, args, act_shapes, obs_shapes):
    logger.info("create actor nets for agent: %d" % id)
    input_size = obs_shapes[id]
    output_size = act_shapes[id]
    model = MLP(args.num_units, input_size, output_size)
    if args.print_net:
        model.summary()
    return model
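
A quick usage sketch for the factory above, assuming actor.py (with its MLP and logger) is importable; the args fields and shape lists below are illustrative values, not taken from the repo's configs:

# Hypothetical usage of get_actor_model; num_units, print_net and the
# per-agent shape lists are made-up example values.
from types import SimpleNamespace

args = SimpleNamespace(num_units=64, print_net=False)
obs_shapes = [10, 10, 10]   # one observation size per agent
act_shapes = [2, 2, 2]      # one action size per agent
actor = get_actor_model(0, args, act_shapes, obs_shapes)  # net for agent 0
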
Example #2
def get_critic_model(id, args, act_shapes, obs_shapes):
    logger.info("create critic nets for agent: %d" % id)
    input_size = sum(obs_shapes) + sum(act_shapes)
    output_size = 1
    model = MLP(args.num_units, input_size, output_size)
    if args.print_net:
        model.summary()
    return model
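
Unlike the actor, the critic is centralized in the MADDPG style: it consumes every agent's observation and action concatenated into one input vector. A worked example of the size arithmetic, with illustrative shapes:

# Three agents (hypothetical sizes): each observes 10 values and emits
# 2 action values, so the centralized critic sees one flat vector.
obs_shapes = [10, 10, 10]
act_shapes = [2, 2, 2]
input_size = sum(obs_shapes) + sum(act_shapes)   # 30 + 6 = 36
# output_size stays 1: the critic returns a single Q-value estimate.
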
Example #3
File: run.py Project: iminders/maddpg
def learn(args):
    env = make_env(args=args, id=0)
    agent = make_learner_agent(args, env.n, env.action_space,
                               env.observation_space)
    env = None  # env was only needed for the agent count and spaces; release it before serving
    serve(agent)
    agent.upload_minio()
    logger.info("Finished, tensorboard --logdir=%s" % agent.tb_dir)
Example #4
def get_shapes(in_space):
    logger.info(str(in_space))
    from gym import spaces
    if isinstance(in_space[0], spaces.Box):
        return [space.shape[0] for space in in_space]
    if isinstance(in_space[0], spaces.Discrete):
        return [space.n for space in in_space]
    raise NotImplementedError
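
Note that only the first space's type is inspected, so the list is assumed homogeneous. A usage sketch with illustrative gym spaces, assuming the snippet above is importable:

import numpy as np
from gym import spaces

# Continuous case: each Box contributes its flat observation length.
boxes = [spaces.Box(low=-1.0, high=1.0, shape=(8,), dtype=np.float32)] * 3
print(get_shapes(boxes))                     # -> [8, 8, 8]
# Discrete case: each space contributes its number of actions.
print(get_shapes([spaces.Discrete(5)] * 3))  # -> [5, 5, 5]
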
Example #5
    def upload_minio(self):
        logger.info("upload model into minio")
        # upload tensorboard
        dest_obj_name = "exps/tensorboard/%s/%s.tar.gz" % (
            self.args.runner, self.args.run_id)
        self.stoarge.tar_and_fput(self.tb_dir, dest_obj_name)
        # upload model
        dest_obj_name = "exps/model/%s/%s.tar.gz" % (
            self.args.runner, self.args.run_id)
        self.stoarge.tar_and_fput(self.model_dir, dest_obj_name)
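
The storage object's tar_and_fput helper is not shown in these excerpts. A plausible sketch of its job, assuming a MinIO-backed store via the MinIO Python SDK (the repo's actual storage class may differ):

import tarfile
import tempfile
from minio import Minio  # assumption: storage is backed by the MinIO SDK

def tar_and_fput(client, bucket, src_dir, dest_obj_name):
    # Pack src_dir into a temporary .tar.gz, then upload it as one object.
    with tempfile.NamedTemporaryFile(suffix=".tar.gz") as tmp:
        with tarfile.open(tmp.name, "w:gz") as tar:
            tar.add(src_dir, arcname="payload")
        client.fput_object(bucket, dest_obj_name, tmp.name)

# Hypothetical call, mirroring the object names built above:
# client = Minio("minio.example.com", access_key="...", secret_key="...")
# tar_and_fput(client, "exps", "./model", "exps/model/runner/run1.tar.gz")
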
Example #6
File: run.py Project: iminders/maddpg
def explore_and_learn(args):
    logger.info("CPU count:{}".format(cpu_count()))
    processes = []
    # learn
    p = Process(target=learn, args=(args,))
    p.start()
    processes.append(p)
    # explore
    p = Process(target=parallel_explore, args=(args,))
    p.start()
    processes.append(p)

    for p in processes:
        p.join()
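
The two processes share no Python state; they talk over the ZMQ REQ/REP socket shown in Examples #8 and #10. parallel_explore itself is referenced but not shown in these excerpts; a plausible sketch, under the assumption that it fans out one explore() worker per environment client (explore is defined in Example #8, and serve() counts args.num_env clients):

from multiprocessing import Process

def parallel_explore(args):
    # Hypothetical fan-out: one explorer process per environment client.
    workers = [Process(target=explore, args=(args, wid))
               for wid in range(args.num_env)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
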
Example #7
    def action(self, obs):
        self.step += len(obs)
        if self.step % self.decay_step == 0:
            self.sigma = max(self.sigma * self.decay_rate, self.min_sigma)
            logger.info("sigma decay to: %.3f,at %d" % (self.sigma, self.step))
        batch_obs = tf.convert_to_tensor(np.asarray(obs), dtype=tf.float32)

        acts = []
        for i in range(self.n):
            act = self.actors[i](batch_obs[:, i, :])
            noised_act = act + tf.random.normal(
                shape=act.shape, stddev=self.sigma, dtype=tf.float32)
            # TODO(liuwen): clip according to act_space
            acts.append(tf.clip_by_value(noised_act, -1.0, 1.0))
        acts_tf = tf.stack(acts, axis=1)
        return acts_tf
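
The exploration noise anneals geometrically every decay_step environment steps, but never below min_sigma. A tiny worked example of the schedule, with illustrative hyperparameters:

# Illustrative decay: starting sigma 0.5, rate 0.99, floor 0.05.
sigma, decay_rate, min_sigma = 0.5, 0.99, 0.05
for _ in range(100):                 # pretend decay_step fired 100 times
    sigma = max(sigma * decay_rate, min_sigma)
print(round(sigma, 3))               # 0.5 * 0.99**100 ~= 0.183
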
Example #8
def explore(args, id):
    c = zmq.Context()
    s = c.socket(zmq.REQ)
    host = 'tcp://%s:%d' % (args.host, args.port)
    s.connect(host)
    logger.info('zmq socket addr: %s' % host)
    batch_env = BatchedEnvironment(args, id)
    obs = batch_env.reset()
    action = batch_env.uniform_action()
    i = 0
    n = args.env_batch_size
    episode = [0] * n
    episode_step = [0] * n

    while True:
        next_obs, rew, done, info = batch_env.step(action)
        i += n
        increment(episode_step, n)
        terminal = [episode_step[j] >= args.max_episode_len for j in range(n)]
        sample = [obs, action, next_obs, rew, done, terminal]
        p = pickle.dumps(sample)
        z = zlib.compress(p)
        while True:
            try:
                s.send_pyobj(z)
                data = s.recv_pyobj()
                action = pickle.loads(data)
                break
            except zmq.ZMQError:
                logger.error("send to zmq server[%s] error, sleep 1s" % host)
                time.sleep(1)
        if str(action) == "stop":
            logger.info("[%d],%d finished explore, learning server stoped" %
                        (id, i))
            break

        if i % (10 * args.save_rate) == 0:
            logger.debug("batch_env[%d] step:%i, episode:%s" %
                         (id, i, str(episode)))
        obs = batch_env.reset_if_done(done, terminal, episode_step, episode)
        if i % 10000 == 0:
            logger.debug(str(id) + ":" + str(episode))
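
The explorer double-serializes on purpose: send_pyobj pickles whatever it is given, so the zlib-compressed pickle travels as a pickled bytes object, and the learner's reply arrives pickled twice as well. A minimal sketch of one request/reply round trip against a running learner (the address and payload are illustrative):

import pickle
import zlib
import zmq

ctx = zmq.Context()
sock = ctx.socket(zmq.REQ)
sock.connect("tcp://127.0.0.1:5555")       # illustrative host/port
sample = [[0.1] * 4, [0.0] * 2, [0.1] * 4, 0.0, [False], False]
sock.send_pyobj(zlib.compress(pickle.dumps(sample)))  # what serve() expects
reply = pickle.loads(sock.recv_pyobj())    # unwrap the second pickle layer
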
Example #9
    def __init__(self, args, agent_num, act_spaces, obs_spaces):
        super(Agent, self).__init__(args, agent_num, act_spaces, obs_spaces)
        logger.info("actors:act_shapes:%s, obs_shapes:%s" %
                    (str(self.act_shapes), str(self.obs_shapes)))

        self.actors = self.create_actors()
        self.target_actors = self.create_actors()
        logger.info("critics:act_shapes:%s, obs_shapes:%s" %
                    (str(self.act_shapes), str(self.obs_shapes)))
        self.critics = self.create_critics()
        self.target_critics = self.create_critics()
        self.sigma = args.sigma
        self.decay_step = args.decay_step
        self.decay_rate = args.decay_rate
        self.min_sigma = args.min_sigma
        self.actor_optimizers = [
            tf.keras.optimizers.Adam(learning_rate=args.plr, name='Adam')
            for i in range(self.n)
        ]
        self.critic_optimizers = [
            tf.keras.optimizers.Adam(learning_rate=args.qlr, name='Adam')
            for i in range(self.n)
        ]
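
The constructor builds separate target actors and critics, but these excerpts do not show how they are synchronized with the live networks. MADDPG conventionally uses a Polyak (soft) update; a sketch of that step, with tau as an assumed hyperparameter:

import tensorflow as tf

def soft_update(target_net, source_net, tau=0.01):
    # Move each target variable a small step toward the live network.
    for t, s in zip(target_net.trainable_variables,
                    source_net.trainable_variables):
        t.assign(tau * s + (1.0 - tau) * t)
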
Example #10
File: learner.py Project: iminders/maddpg
def serve(agent):
    logger.info("serve")
    c = zmq.Context()
    s = c.socket(zmq.REP)
    s.bind('tcp://127.0.0.1:%d' % agent.args.port)
    logger.info("zmq bind at tcp://0.0.0.0:%d" % agent.args.port)

    explore_size = agent.args.explore_size
    env_batch_size = agent.args.env_batch_size

    i, train_iter, episode, stop_client_num, record_i = 0, 0, 0, 0, 0

    episode_rews = [0] * agent.args.save_rate
    mean_reward = 0.0

    start = time.time()
    batch_start = time.time()
    log_start = time.time()

    with agent.writer.as_default():
        while True:
            z = s.recv_pyobj()
            p = zlib.decompress(z)
            data = pickle.loads(p)
            [obs, action, next_obs, rew, done, terminal] = data
            for j in range(env_batch_size):
                agent.buffer.add(obs[j], action[j], rew[j], next_obs[j],
                                 done[j])
            i += env_batch_size

            if i % explore_size == 0 and episode <= agent.args.warm_up:
                t = time.time()
                if episode < agent.args.save_rate:
                    mean_reward = 0.0
                else:
                    mean_reward = np.mean(episode_rews)
                logger.info(
                    get_explore_log(i, agent.args.warm_up, episode,
                                    mean_reward, t - batch_start, t - start))
                batch_start = t

            for j in range(env_batch_size):
                if all(done[j]) or terminal[j]:
                    episode += 1
                    loc = episode % agent.args.save_rate
                    episode_rews[loc] = np.sum(rew[j])
                    if episode % agent.args.save_rate == 0:
                        record_i += 1
                        mean_reward = np.mean(episode_rews)
                        if mean_reward > agent.best_score:
                            agent.best_score = mean_reward
                            agent.save()
                        tf.summary.scalar('1.performance/2.episode_reward',
                                          mean_reward, record_i)
                        if episode > agent.args.warm_up:
                            batch_end = time.time()
                            log_msg = get_train_log(i, episode,
                                                    agent.args.num_episodes,
                                                    mean_reward,
                                                    batch_end - log_start,
                                                    batch_end - start)
                            log_start = batch_end
                            logger.info(log_msg)

            if i % explore_size == 0 and episode > agent.args.warm_up:
                train_iter += 1
                explore_time = time.time() - batch_start
                logger.debug(
                    "serve collected %d explore samples in %.3f secs" %
                    (agent.args.batch_size, explore_time))
                tf.summary.scalar('3.time/2.explore', explore_time, train_iter)
                agent.learn(train_iter)
                batch_start = time.time()
            action = agent.action(next_obs)
            p = pickle.dumps(action)

            if episode >= agent.args.num_episodes:
                stop_client_num += 1
                logger.info("i=%d, episode=%d" % (i, episode))
                s.send_pyobj(pickle.dumps("stop"))
                if stop_client_num >= agent.args.num_env:
                    agent.writer.close()
                    break
            else:
                s.send_pyobj(p)
    s.close()
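
One subtlety in the stop path: send_pyobj pickles its argument, so the "stop" sentinel crosses the wire pickled twice, which is exactly why explore() calls pickle.loads on the received object before comparing. A self-contained check:

import pickle

wire = pickle.dumps(pickle.dumps("stop"))  # what send_pyobj actually emits
inner = pickle.loads(wire)                 # what the client's recv_pyobj returns
assert pickle.loads(inner) == "stop"       # the comparison made in explore()
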
Example #11
File: learner.py Project: iminders/maddpg
def make_learner_agent(args=None, n=3, act_spaces=None, obs_spaces=None):
    logger.info("act_spaces:" + str(act_spaces))
    logger.info("obs_spaces:" + str(obs_spaces))
    agent = Agent(args, n, act_spaces, obs_spaces)
    return agent
Example #12
File: run.py Project: iminders/maddpg
    p = Process(target=learn, args=(args,))
    p.start()
    processes.append(p)
    # explore
    p = Process(target=parallel_explore, args=(args,))
    p.start()
    processes.append(p)

    for p in processes:
        p.join()


if __name__ == '__main__':
    args = parse_experiment_args()
    if args.debug:
        import logging
        logger.setLevel(logging.DEBUG)

    if args.role == EXPLORER:
        parallel_explore(args)

    if args.role == LEARNER:
        logger.info("parameters start" + "*" * 100)
        logger.info(str(args))
        logger.info("parameters end  " + "*" * 100)

        logger.info("set global_seeds: %s" % str(args.seed))
        set_global_seeds(args.seed)
        explore_and_learn(args)
        # learn(args)
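
set_global_seeds is not shown in these excerpts; a plausible implementation, assuming it seeds Python, NumPy, and TensorFlow (the repo's helper may differ):

import random
import numpy as np
import tensorflow as tf

def set_global_seeds(seed):
    # Seed every RNG the training stack touches; skip when no seed is given.
    if seed is None:
        return
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)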