def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()
    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
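A minimal usage sketch for make_vec_env above; the env id and worker count are illustrative, and with num_env=1 the call would fall back to DummyVecEnv as in the last branch:

venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)  # illustrative arguments
obs = venv.reset()   # batched observations, one row per worker
venv.close()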
Example #2
def main():
    FLAGS(sys.argv)
    env = SubprocVecEnv(1, 'CollectMineralShards')
    env.reset()
    total_reward = 0
    for _ in range(1000):
        marine = random.randrange(2)
        x = random.randrange(32)
        y = random.randrange(32)
        print('Move %d to (%d, %d)' % (marine, x, y))
        move_action = construct_action(marine, x, y)
        # This controls the APM.
        for _ in range(7):
            obs, rs, dones, _, _, _, selected, screens = env.step(
                [move_action])
            total_reward += rs
        # Query the positions of the two marines.
        m_pos = {}
        m_pos['0'], rs, dones = get_position(env, 0)
        total_reward += rs
        m_pos['1'], rs, dones = get_position(env, 1)
        total_reward += rs

        print(rs)
        print(dones)
        print('Total reward: ', total_reward)
        print(m_pos)

    env.close()
Example #3
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 cloth_cfg_path=None,
                 render_path=None,
                 start_state_path=None):
    """Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.

    Daniel: the above docs from baselines seem out of date; do ALL env types go here?
    Also, we're adding arguments for the cloth env: the config path, the render
    path, and the starting state path (last one is optional for the cloth).
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank,
                   cloth_cfg_path=None,
                   render_path=None,
                   start_state_path=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir,
            cloth_cfg_path=cloth_cfg_path,
            render_path=render_path,
            start_state_path=start_state_path,
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([
            make_thunk(
                i + start_index,
                cloth_cfg_path=cloth_cfg_path,
                render_path=None,  # Daniel: for now
                start_state_path=start_state_path) for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(start_index,
                       cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)
        ])
Example #4
    def __init__(self,
                 sess,
                 log_path,
                 env_function,
                 num_cpu,
                 network="fullyconv",
                 ar=True,
                 lr=1e-4,
                 optimizer="rmsprop",
                 ent_coef=1e-3,
                 vf_coef=1.0,
                 max_grad_norm=0.5,
                 nsteps=5,
                 nstack=1,
                 gamma=0.99):

        if optimizer == "adam":
            self.trainer = tf.train.AdamOptimizer
        elif optimizer == "rmsprop":
            self.trainer = tf.train.RMSPropOptimizer
        else:
            raise NotImplementedError

        network_func = None
        if network == "fullyconv":
            network_func = networks.FullyConvNet
        elif network == "atari":
            network_func = networks.AtariNet
        else:
            raise NotImplementedError

        self.sess = sess
        self.log_path = log_path
        self.num_cpu = num_cpu
        self.env_function = env_function
        self.init_lr = lr
        self.env = SubprocVecEnv([self.env_function(i) for i in range(1)])
        self.model = Model(network_func=network_func,
                           screen_space=self.env.screen_space,
                           minimap_space=self.env.minimap_space,
                           ns_space=self.env.ns_space,
                           trainer=self.trainer,
                           ar=ar,
                           nstack=nstack,
                           ent_coef=ent_coef,
                           vf_coef=vf_coef,
                           max_grad_norm=max_grad_norm)
        self.gamma = gamma
        self.nsteps = nsteps
        self.ar = ar

        if ar:
            self.step_func = self.step_policy_ar
        else:
            self.step_func = self.step_policy
Example #5
        def make_vec_envs(evaluation):
            def env_thunk(rank):
                return lambda: self.make_env(seed=int(seed),
                                             rank=rank,
                                             evaluation=evaluation,
                                             env_id=env_id)

            env_fns = [env_thunk(i) for i in range(num_processes)]
            use_dummy = (len(env_fns) == 1 or sys.platform == "darwin"
                         or synchronous)
            return VecPyTorch(DummyVecEnv(env_fns, render=render)
                              if use_dummy else SubprocVecEnv(env_fns))
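The extra env_thunk layer matters because each returned lambda must close over its own rank. Below is a minimal sketch of the same pattern, assuming plain gym and the stock baselines DummyVecEnv rather than the project's render-aware variant (all names here are illustrative, not from the project above):

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

def make_thunk(rank, seed=0):
    # The lambda captures this call's own `rank`; building lambdas directly
    # inside the list comprehension would bind the loop variable late and
    # give every worker the same rank.
    return lambda: make_single(rank, seed)

def make_single(rank, seed):
    env = gym.make('CartPole-v1')   # illustrative env id
    env.seed(seed + rank)           # pre-0.26 gym seeding API
    return env

venv = DummyVecEnv([make_thunk(i) for i in range(4)])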
Example #6
File: train.py Project: oidelima/ppo
    def make_vec_envs(
        self,
        num_processes,
        gamma,
        render,
        synchronous,
        env_id,
        add_timestep,
        seed,
        evaluation,
        time_limit,
        num_frame_stack=None,
        **env_args,
    ):
        envs = [
            functools.partial(  # thunk
                self.make_env,
                rank=i,
                env_id=env_id,
                add_timestep=add_timestep,
                seed=seed,
                evaluation=evaluation,
                time_limit=time_limit,
                evaluating=evaluation,
                **env_args,
            ) for i in range(num_processes)
        ]

        if len(envs) == 1 or sys.platform == "darwin" or synchronous:
            envs = DummyVecEnv(envs, render=render)
        else:
            envs = SubprocVecEnv(envs)

        # if (
        # envs.observation_space.shape
        # and len(envs.observation_space.shape) == 1
        # ):
        # if gamma is None:
        # envs = VecNormalize(envs, ret=False)
        # else:
        # envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack)
        # elif len(envs.observation_space.shape) == 3:
        #     envs = VecPyTorchFrameStack(envs, 4, device)

        return envs
Example #7
def build_env4gail(args, nenv):
    def make_env():
        def _thunk():
            env = gym.make(args.env_id)
            env.seed(args.seed)  # to make the results more reproducible
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        return _thunk

    envs = [make_env() for i in range(nenv)]
    envs = SubprocVecEnv(envs)
    envs = VecNormalize(envs)

    return envs
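A hedged usage sketch for build_env4gail; the args namespace below is hypothetical, and any object exposing env_id and seed attributes should work:

from argparse import Namespace

args = Namespace(env_id='Hopper-v2', seed=0)   # hypothetical settings
envs = build_env4gail(args, nenv=4)
obs = envs.reset()   # observations (and returns) are normalized by VecNormalize
envs.close()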
Example #8
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though the crash then happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
Example #9
def make_atari_env(env_id, num_threads, seed, frame_stack=4):
    game_lives = gym.make(env_id).unwrapped.ale.lives()
    game_lives = game_lives if game_lives != 0 else 1

    def make_env(rank):
        def _thunk():
            env = wrappers.make_atari(env_id)
            env.seed(seed + rank)
            return wrappers.wrap_deepmind(env)

        return _thunk

    np.random.seed(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_threads)])
    env = VecFrameStack(env, frame_stack)
    return env, game_lives
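Usage might look like the sketch below (env id and thread count are illustrative); the function returns both the frame-stacked vec env and the game's life count:

env, game_lives = make_atari_env('BreakoutNoFrameskip-v4', num_threads=8, seed=0)
obs = env.reset()   # roughly (8, 84, 84, 4) with the default DeepMind wrappers
env.close()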
Example #10
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example #11
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(
                env_id)
            env.seed(seed + 10000 * mpi_rank +
                     rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(),
                                           str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)

            if env_type == 'atari': return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1: return RewardScaler(env, reward_scale)
            else: return env

        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
Example #12
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "deepq-4way"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Example #13
def main():
    FLAGS(sys.argv)

    steps = 0  #Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too restricting
        # We need this change because sc2 now requires specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                #players=[sc2_env.Agent(sc2_env.Race.terran),sc2_env.Agent(sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)

            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)

            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="Simple64",
                            players=[
                                sc2_env.Agent(race=sc2_env.Race.terran),
                                sc2_env.Agent(race=sc2_env.Race.terran)
                            ],
                            step_mul=step_mul,
                            agent_interface_format=AGENT_INTERFACE_FORMAT,
                            visualize=True) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Example #14
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "acktr"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        # def make_env(rank):
        #   # env = sc2_env.SC2Env(
        #   #   "CollectMineralShards",
        #   #   step_mul=step_mul)
        #   # return env
        #   #env.seed(seed + rank)
        #   def _thunk():
        #     env = sc2_env.SC2Env(
        #         map_name=FLAGS.map,
        #         step_mul=step_mul,
        #         visualize=True)
        #     #env.seed(seed + rank)
        #     if logger.get_dir():
        #      env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
        #     return env
        #   return _thunk

        # agents = [Agent()
        #           for _ in range(num_cpu)]
        #
        # for agent in agents:
        #   time.sleep(1)
        #   agent.daemon = True
        #   agent.start()

        # agent_controller = AgentController(agents)

        #set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)

        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
Example #15
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(  # interface.feature_layer.resolution and interface.feature_layer.minimap_resolution
            feature_dimensions=sc2_env.Dimensions(screen=32,
                                                  minimap=32)  # 16 16
            # feature_dimensions = sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # game-step multiplier; roughly the effective actions per second of a human player
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = deepq.models.cnn_to_mlp(  # this model takes an observation as input and returns values for all actions; note how it is used in deepq_mineral_shards.learn
                convs=[(16, 8, 4), (32, 4, 2)],
                hiddens=[256],
                dueling=True)  # (number of filters, kernel size, stride)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)  # (number of filters, kernel size, stride)
            act = deepq_mineral_shards.learn(  # train the model and save it
                # act = deepq_ActSeparate.learn(  # train the model and save it
                # act=deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  #default 16  num_actions=256   3  4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback
            )  #deepq_callback; deepq_ActSeperate_callback  ;   deepq_actSeparateWith4Directions_callback  deep_DiffActInSameTime_callback
            act.save(
                "mineral_shards.pkl"
            )  # after all training steps, save the trained model to mineral_shards.pkl for use by enjoy_mineral_shards.py

    elif (FLAGS.algorithm == "deepq-4way"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(  #
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            # model = deepq.models.mlp(hiddens=[256,128,4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Example #16
 def remake_env(self, num_cpu):
     self.env.close()
     self.env = SubprocVecEnv(
         [self.env_function(i) for i in range(num_cpu)])
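A brief usage sketch, assuming agent is an instance of the surrounding class (presumably the same class as in Example #4) and that the worker count below is illustrative:

agent.remake_env(num_cpu=8)   # tear down the old workers and relaunch with 8 subprocesses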