def main():
    """Train a dueling DQN on the CollectMineralShards minigame and save
    the resulting policy to mineral_shards.pkl."""
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",
            step_mul=step_mul,
            players=[sc2_env.Agent(sc2_env.Race.terran)],
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64)),
            visualize=True) as env:
        # Q-network: three conv layers feeding a 256-unit hidden layer,
        # with a dueling value/advantage head.
        q_network = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)

        act = deepq_mineral_shards.learn(
            env,
            q_func=q_network,
            num_actions=4,
            lr=1e-5,
            max_timesteps=2000000,
            buffer_size=100000,
            exploration_fraction=0.5,
            exploration_final_eps=0.01,
            train_freq=4,
            learning_starts=100000,
            target_network_update_freq=1000,
            gamma=0.99,
            prioritized_replay=True)
        act.save("mineral_shards.pkl")
Exemplo n.º 2
0
def main():
    """Train a dueling DQN on CollectMineralShards and save the policy."""
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="CollectMineralShards",
                        step_mul=step_mul) as env:
        # CNN Q-network: three conv layers into a 256-unit hidden layer
        # with a dueling head.
        q_network = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)

        act = deepq_mineral_shards.learn(
            env,
            q_func=q_network,
            num_actions=4,  # only four output actions: up, down, left, right
            lr=1e-5,
            max_timesteps=2000000,
            buffer_size=100000,
            exploration_fraction=0.5,
            exploration_final_eps=0.01,
            train_freq=4,
            learning_starts=100000,
            target_network_update_freq=1000,
            gamma=0.99,
            prioritized_replay=True)
        act.save("mineral_shards.pkl")
Exemplo n.º 3
0
def main():
    """Train on SC2 minigames with the algorithm chosen by FLAGS.algorithm.

    Supported algorithms: "deepq" (16-action DQN on CollectMineralGas),
    "deepq-4way" (4-action DQN on CollectMineralGas), and "a2c"
    (multi-process actor-critic). Metrics go to TensorBoard or stdout
    according to FLAGS.log.
    """
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    # A learning rate of exactly 0 requests a randomly sampled one
    # (cheap hyperparameter search).
    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    # Both DQN variants use the same log-directory layout, so the two
    # previously duplicated branches are merged into one.
    if FLAGS.algorithm in ("deepq-4way", "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:

            # Dueling CNN -> MLP Q-network.
            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6) // 4

        seed = 0

        # NOTE(review): unlike the DQN branches, env is not used as a
        # context manager and is never closed explicitly — confirm
        # SubprocVecEnv tears down its workers on exit.
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Exemplo n.º 4
0
def main():
    """Train on CollectMineralShards with DQN ("deepq") or ACKTR ("acktr").

    Flags select the algorithm, the log sink (TensorBoard/stdout), and
    hyperparameters; metrics are written under tensorboard/mineral/.
    """
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:

            # Dueling CNN -> MLP Q-network.
            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            # NOTE(review): lr, max_timesteps and exploration_fraction are
            # hard-coded here even though the corresponding FLAGS are
            # printed and embedded in logdir above — confirm intentional.
            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "acktr":
        # 40e6 total steps scaled down by 4.
        num_timesteps = int(40e6) // 4

        seed = 0

        # NOTE(review): env is never closed explicitly (no context
        # manager) — confirm SubprocVecEnv cleans up on process exit.
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)

        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
Exemplo n.º 5
0
def main():
    """Dispatch CollectMineralShards training to the algorithm named by
    FLAGS.algorithm: "deepq", "deepq-4way", or "a2c"."""
    FLAGS(sys.argv)

    # Echo the relevant flags; output is identical to printing each one
    # with its own "name : %s" format string.
    for flag_name in ("algorithm", "timesteps", "exploration_fraction",
                      "prioritized", "dueling", "num_agents", "lr"):
        print("%s : %s" % (flag_name, getattr(FLAGS, flag_name)))

    # lr of zero means "sample a random learning rate".
    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        # sc2 now requires an explicit player list; a single terran agent
        # is a stop-gap and might be too restricting.
        interface_fmt = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                players=[sc2_env.Agent(sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=interface_fmt) as env:

            q_network = cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

            # learn() returns two act objects here; they are saved
            # separately (x and y files).
            acts = deepq_mineral_shards.learn(
                env,
                q_func=q_network,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        interface_fmt = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="CollectMineralShards",
                            step_mul=step_mul,
                            agent_interface_format=interface_fmt,
                            visualize=True) as env:

            q_network = cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=q_network,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Exemplo n.º 6
0
def main():
    """Train on CollectMineralShards with the algorithm selected by
    FLAGS.algorithm: "deepq", "deepq-4way", or "a2c".

    FLAGS.lr == 0 requests a randomly sampled learning rate (simple
    hyperparameter search).
    """
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    # lr == 0 requests a randomly sampled learning rate.
    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,
                visualize=True,
                screen_size_px=(16, 16),
                minimap_size_px=(16, 16)) as env:

            # Dueling CNN -> MLP Q-network.
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,
                screen_size_px=(32, 32),
                minimap_size_px=(32, 32),
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        # 40e6 total steps scaled down by 4.
        num_timesteps = int(40e6) // 4

        seed = 0

        # NOTE(review): env is never closed explicitly — confirm
        # SubprocVecEnv tears down its workers on process exit.
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(
            policy_fn,
            env,
            seed,
            total_timesteps=num_timesteps,
            nprocs=FLAGS.num_agents + FLAGS.num_scripts,
            nscripts=FLAGS.num_scripts,
            ent_coef=FLAGS.ent_coef,
            vf_coef=FLAGS.vf_coef,
            nsteps=FLAGS.nsteps,
            max_grad_norm=FLAGS.max_grad_norm,
            callback=a2c_callback)
def main():
    """Train on CollectMineralShards with "deepq", "deepq-4way", or "a2c",
    logging to TensorBoard or stdout according to FLAGS.log.
    """
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    # lr == 0 requests a randomly sampled learning rate.
    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(  # replaces interface.feature_layer.resolution and interface.feature_layer.minimap_resolution
            feature_dimensions=sc2_env.Dimensions(screen=32,
                                                  minimap=32)  # 16 16
            # feature_dimensions = sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # game-step advance rate; roughly the effective operations per second of a human player
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = deepq.models.cnn_to_mlp(  # this model takes an observation as input and returns values of all actions; note how it is used in deepq_mineral_shards.learn
                convs=[(16, 8, 4), (32, 4, 2)],
                hiddens=[256],
                dueling=True)  # per conv layer: (num_filters, kernel_size, stride)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)  # (num_filters, kernel_size, stride)
            act = deepq_mineral_shards.learn(  # train the model and save it
                # act = deepq_ActSeparate.learn(  # train the model and save it
                # act=deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  # default 16; other values tried: 256, 3, 4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                # NOTE(review): learner is deepq_mineral_shards but the
                # callback is deepq_actSeparateWith4Directions_callback —
                # confirm this pairing is intended.
                callback=deepq_actSeparateWith4Directions_callback
            )  # callback options: deepq_callback; deepq_ActSeperate_callback; deepq_actSeparateWith4Directions_callback; deep_DiffActInSameTime_callback
            act.save(
                "mineral_shards.pkl"
            )  # after training, save the trained model to mineral_shards.pkl for use by enjoy_mineral_shards.py

    elif (FLAGS.algorithm == "deepq-4way"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(  # saves a replay every 2 episodes into replay_dir
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                # NOTE(review): hard-coded Windows path — not portable.
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            # model = deepq.models.mlp(hiddens=[256,128,4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)