Example #1
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "role_avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
        "roles": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        }
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directiory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #2
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    #args.action_space = env_info["action_space"]
    args.action_spaces = env_info["action_spaces"]
    args.actions_dtype = env_info["actions_dtype"]
    # if true, action vectors need to sum to one
    args.normalise_actions = env_info.get("normalise_actions", False)


    # create function scaling agent action tensors to and from range [0,1]
    ttype = th.FloatTensor if not args.use_cuda else th.cuda.FloatTensor
    mult_coef_tensor = ttype(args.n_agents, args.n_actions)
    action_min_tensor = ttype(args.n_agents, args.n_actions)
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].shape[0]):
                _action_min = args.action_spaces[_aid].low[_actid]
                _action_max = args.action_spaces[_aid].high[_actid]
                mult_coef_tensor[_aid, _actid] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = np.asscalar(_action_min)
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):    # NOTE: This was added to handle scenarios like simple_reference since the action space is Tuple
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].spaces[0].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[0].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[0].high[_actid]
                mult_coef_tensor[_aid, _actid] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = np.asscalar(_action_min)
            for _actid in range(args.action_spaces[_aid].spaces[1].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[1].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[1].high[_actid]
                tmp_idx = _actid + args.action_spaces[_aid].spaces[0].shape[0]
                mult_coef_tensor[_aid, tmp_idx] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, tmp_idx] = np.asscalar(_action_min)

    args.actions2unit_coef = mult_coef_tensor
    args.actions2unit_coef_cpu = mult_coef_tensor.cpu()
    args.actions2unit_coef_numpy = mult_coef_tensor.cpu().numpy()
    args.actions_min = action_min_tensor
    args.actions_min_cpu = action_min_tensor.cpu()
    args.actions_min_numpy = action_min_tensor.cpu().numpy()

    def actions_to_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return args.actions2unit_coef_numpy * actions + args.actions_min_numpy
        elif actions.is_cuda:
            return args.actions2unit_coef * actions + args.actions_min
        else:
            return args.actions2unit_coef_cpu * actions + args.actions_min_cpu

    def actions_from_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return th.div((actions - args.actions_min_numpy), args.actions2unit_coef_numpy)
        elif actions.is_cuda:
            return th.div((actions - args.actions_min), args.actions2unit_coef)
        else:
            return th.div((actions - args.actions_min_cpu), args.actions2unit_coef_cpu)

    # make conversion functions globally available
    args.actions2unit = actions_to_unit_box
    args.unit2actions = actions_from_unit_box

    action_dtype = th.long if not args.actions_dtype == np.float32 else th.float
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else max([i.shape[0] for i in args.action_spaces])
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else \
                                       max([i.spaces[0].shape[0] + i.spaces[1].shape[0] for i in args.action_spaces])
    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (actions_vshape,), "group": "agents", "dtype": action_dtype},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }

    if not args.actions_dtype == np.float32:
        preprocess = {
            "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
        }
    else:
        preprocess = {}

    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1 if args.runner_scope == "episodic" else 2,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = - args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        if getattr(args, "runner_scope", "episodic") == "episodic":
            episode_batch = runner.run(test_mode=False, learner=learner)
            buffer.insert_episode_batch(episode_batch)

            if buffer.can_sample(args.batch_size) and (buffer.episodes_in_buffer > getattr(args, "buffer_warmup", 0)):
                episode_sample = buffer.sample(args.batch_size)

                # Truncate batch to only filled timesteps
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                learner.train(episode_sample, runner.t_env, episode)
        elif getattr(args, "runner_scope", "episode") == "transition":
            runner.run(test_mode=False,
                       buffer=buffer,
                       learner=learner,
                       episode=episode)
        else:
            raise Exception("Undefined runner scope!")

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            if getattr(args, "testing_on", True):
                for _ in range(n_test_runs):
                    if getattr(args, "runner_scope", "episodic") == "episodic":
                        runner.run(test_mode=True, learner=learner)
                    elif getattr(args, "runner_scope", "episode") == "transition":
                        runner.run(test_mode=True,
                                   buffer = buffer,
                                   learner = learner,
                                   episode = episode)
                    else:
                        raise Exception("Undefined runner scope!")

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            # learner.save_models(save_path, args.unique_token, model_save_time)

            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #3
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.episode_limit = env_info["episode_limit"]
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.unit_dim = env_info["unit_dim"]

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    env_name = args.env
    if env_name == 'sc2':
        env_name += '/' + args.env_args['map_name']

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        args.burn_in_period,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    if args.is_save_buffer:
        save_buffer = ReplayBuffer(
            scheme,
            groups,
            args.save_buffer_size,
            env_info["episode_limit"] + 1,
            args.burn_in_period,
            preprocess=preprocess,
            device="cpu" if args.buffer_cpu_only else args.device)

    if args.is_batch_rl:
        assert (args.is_save_buffer == False)
        x_env_name = env_name
        if args.is_from_start:
            x_env_name += '_from_start/'
        path_name = '../../buffer/' + x_env_name + '/buffer_' + str(
            args.load_buffer_id) + '/'
        assert (os.path.exists(path_name) == True)
        buffer.load(path_name)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directiory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    if args.env == 'matrix_game_1' or args.env == 'matrix_game_2' or args.env == 'matrix_game_3' \
            or args.env == 'mmdp_game_1':
        last_demo_T = -args.demo_interval - 1

    while runner.t_env <= args.t_max:

        if not args.is_batch_rl:
            # Run for a whole episode at a time
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)

            if args.is_save_buffer:
                save_buffer.insert_episode_batch(episode_batch)
                if save_buffer.is_from_start and save_buffer.episodes_in_buffer == save_buffer.buffer_size:
                    save_buffer.is_from_start = False
                    save_one_buffer(args,
                                    save_buffer,
                                    env_name,
                                    from_start=True)
                if save_buffer.buffer_index % args.save_buffer_interval == 0:
                    print('current episodes_in_buffer: ',
                          save_buffer.episodes_in_buffer)

        for _ in range(args.num_circle):
            if buffer.can_sample(args.batch_size):
                episode_sample = buffer.sample(args.batch_size)

                if args.is_batch_rl:
                    runner.t_env += int(
                        th.sum(episode_sample['filled']).cpu().clone().detach(
                        ).numpy()) // args.batch_size

                # Truncate batch to only filled timesteps
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                learner.train(episode_sample, runner.t_env, episode)

                if args.env == 'mmdp_game_1' and args.learner == "q_learner_exp":
                    for i in range(int(learner.target_gap) - 1):
                        episode_sample = buffer.sample(args.batch_size)

                        # Truncate batch to only filled timesteps
                        max_ep_t = episode_sample.max_t_filled()
                        episode_sample = episode_sample[:, :max_ep_t]

                        if episode_sample.device != args.device:
                            episode_sample.to(args.device)

                        learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)
        if args.env == 'mmdp_game_1' and \
                (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size):
            ### demo
            episode_sample = cp.deepcopy(buffer.sample(1))
            for i in range(args.n_actions):
                for j in range(args.n_actions):
                    new_actions = th.Tensor([i, j]).unsqueeze(0).repeat(
                        args.episode_limit + 1, 1)
                    if i == 0 and j == 0:
                        rew = th.Tensor([
                            1,
                        ])
                    else:
                        rew = th.Tensor([
                            0,
                        ])
                    if i == 1 and j == 1:
                        new_obs = th.Tensor(
                            [1, 0]).unsqueeze(0).unsqueeze(0).repeat(
                                args.episode_limit, args.n_agents, 1)
                    else:
                        new_obs = th.Tensor(
                            [0, 1]).unsqueeze(0).unsqueeze(0).repeat(
                                args.episode_limit, args.n_agents, 1)
                    # Truncate batch to only filled timesteps
                    max_ep_t = episode_sample.max_t_filled()
                    episode_sample = episode_sample[:, :max_ep_t]
                    episode_sample['actions'][0, :, :, 0] = new_actions
                    episode_sample['obs'][0, 1:, :, :] = new_obs
                    episode_sample['reward'][0, 0, 0] = rew
                    new_actions_onehot = th.zeros(
                        episode_sample['actions'].squeeze(3).shape +
                        (args.n_actions, ))
                    new_actions_onehot = new_actions_onehot.scatter_(
                        3, episode_sample['actions'].cpu(), 1)
                    episode_sample['actions_onehot'][:] = new_actions_onehot

                    if episode_sample.device != args.device:
                        episode_sample.to(args.device)

                    #print("action pair: %d, %d" % (i, j))
                    learner.train(episode_sample,
                                  runner.t_env,
                                  episode,
                                  show_demo=True,
                                  save_data=(i, j))
            last_demo_T = runner.t_env
            #time.sleep(1)

        if (args.env == 'matrix_game_1' or args.env == 'matrix_game_2' or args.env == 'matrix_game_3') and \
                (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size):
            ### demo
            episode_sample = cp.deepcopy(buffer.sample(1))
            for i in range(args.n_actions):
                for j in range(args.n_actions):
                    new_actions = th.Tensor([i, j]).unsqueeze(0).repeat(
                        args.episode_limit + 1, 1)
                    # Truncate batch to only filled timesteps
                    max_ep_t = episode_sample.max_t_filled()
                    episode_sample = episode_sample[:, :max_ep_t]
                    episode_sample['actions'][0, :, :, 0] = new_actions
                    new_actions_onehot = th.zeros(
                        episode_sample['actions'].squeeze(3).shape +
                        (args.n_actions, )).cuda()
                    new_actions_onehot = new_actions_onehot.scatter_(
                        3, episode_sample['actions'].cuda(), 1)
                    episode_sample['actions_onehot'][:] = new_actions_onehot
                    if i == 0 and j == 0:
                        rew = th.Tensor([
                            8,
                        ])
                    elif i == 0 or j == 0:
                        rew = th.Tensor([
                            -12,
                        ])
                    else:
                        rew = th.Tensor([
                            0,
                        ])
                    if args.env == 'matrix_game_3':
                        if i == 1 and j == 1 or i == 2 and j == 2:
                            rew = th.Tensor([
                                6,
                            ])
                    episode_sample['reward'][0, 0, 0] = rew

                    if episode_sample.device != args.device:
                        episode_sample.to(args.device)

                    #print("action pair: %d, %d" % (i, j))
                    learner.train(episode_sample,
                                  runner.t_env,
                                  episode,
                                  show_demo=True,
                                  save_data=(i, j))
            last_demo_T = runner.t_env
            #time.sleep(1)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            if args.double_q:
                os.makedirs(save_path + '_x', exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run * args.num_circle

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    if args.is_save_buffer and save_buffer.is_from_start:
        save_buffer.is_from_start = False
        save_one_buffer(args, save_buffer, env_name, from_start=True)

    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #4
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.episode_limit = env_info["episode_limit"]

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
        "battle_won": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device,
                          save_episodes=True if args.save_episodes else False,
                          episode_dir=args.episode_dir,
                          clear_existing_episodes=args.clear_existing_episodes)  # TODO maybe just pass args

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    # Model learner
    model_learner = None
    model_buffer = None
    if args.model_learner:
        model_learner = le_REGISTRY[args.model_learner](mac, scheme, logger, args)
        model_buffer = ReplayBuffer(scheme, groups, args.model_buffer_size, buffer.max_seq_length,
                                    preprocess=preprocess,
                                    device="cpu" if args.buffer_cpu_only else args.device,
                                    save_episodes=False)

    if args.use_cuda:
        learner.cuda()
        if model_learner:
            model_learner.cuda()

    if args.checkpoint_path != "":
        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path))
            return

        timestep_to_load = 0
        if args.rl_checkpoint:
            rl_timesteps = []

            # Go through all files in args.checkpoint_path
            for name in os.listdir(args.checkpoint_path):
                full_name = os.path.join(args.checkpoint_path, name)
                # Check if they are dirs the names of which are numbers
                name = name.replace('rl_', '')
                if os.path.isdir(full_name) and name.isdigit():
                    rl_timesteps.append(int(name))

            load_step = int(args.load_step.replace('rl_', '')) if isinstance(args.load_step, str) else args.load_step
            if load_step == 0:
                # choose the max timestep
                timestep_to_load = max(rl_timesteps)
            else:
                # choose the timestep closest to load_step
                timestep_to_load = min(rl_timesteps, key=lambda x: abs(x - load_step))

            model_path = os.path.join(args.checkpoint_path, f"rl_{timestep_to_load}")

        else:
            timesteps = []

            # Go through all files in args.checkpoint_path
            for name in os.listdir(args.checkpoint_path):
                full_name = os.path.join(args.checkpoint_path, name)
                # Check if they are dirs the names of which are numbers
                if os.path.isdir(full_name) and name.isdigit():
                    timesteps.append(int(name))

            if args.load_step == 0:
                # choose the max timestep
                timestep_to_load = max(timesteps)
            else:
                # choose the timestep closest to load_step
                timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

            model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner, buffer)
            return

        # TODO checkpoints for model_learner

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    # new stuff
    collect_episodes = True
    collected_episodes = 0
    train_rl = False
    rl_iterations = 0
    model_trained = False
    n_model_trained = 0
    last_rl_T = 0
    rl_model_save_time = 0

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))
    while runner.t_env <= args.t_max:

        if model_learner:
            if collect_episodes:
                episode_batch = runner.run(test_mode=False)  # collect real episode to progress t_env
                print(f"Collecting {args.batch_size_run} episodes from REAL ENV using epsilon: {runner.mac.env_action_selector.epsilon:.2f}, t_env: {runner.t_env}, collected episodes: {collected_episodes}")
                buffer.insert_episode_batch(episode_batch)
                collected_episodes += args.batch_size_run

            n_collect = args.model_n_collect_episodes if model_trained else args.model_n_collect_episodes_initial
            if collected_episodes >= n_collect:
                print(f"Collected {collected_episodes} REAL episodes, training ENV model")
                # stop collection and train model
                collect_episodes = False
                collected_episodes = 0
                model_learner.train(buffer, runner.t_env, plot_test_results=False)
                model_trained = True
                n_model_trained += 1
                train_rl = True

                if args.model_rollout_before_rl:
                    print(f"Generating {args.model_rollouts} MODEL episodes")
                    rollouts = 0
                    rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size)
                    while rollouts < args.model_rollouts:
                        model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations)
                        model_buffer.insert_episode_batch(model_batch)
                        rollouts += rollout_batch_size

            if train_rl: # and model_buffer.can_sample(args.batch_size):

                # generate synthetic episodes under current policy
                if not args.model_rollout_before_rl:
                    print(f"Generating {args.model_rollouts} MODEL episodes")
                    rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size)
                    model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations)
                    model_buffer.insert_episode_batch(model_batch)

                if model_buffer.can_sample(args.batch_size):
                    for _ in range(args.model_rl_iterations_per_generated_sample):
                        episode_sample = model_buffer.sample(args.batch_size)

                        # truncate batch to only filled timesteps
                        max_ep_t = episode_sample.max_t_filled()
                        episode_sample = episode_sample[:, :max_ep_t]

                        if episode_sample.device != args.device:
                            episode_sample.to(args.device)

                        # train RL agent
                        learner.train(episode_sample, runner.t_env, rl_iterations)
                        rl_iterations += 1
                        print(f"Model RL iteration {rl_iterations}, t_env: {runner.t_env}")

            if not collect_episodes and rl_iterations > 0 and rl_iterations % args.model_update_interval == 0:
                if args.max_model_trained == 0 or args.max_model_trained and n_model_trained < args.max_model_trained:
                    print(f"Time to update model")
                    collect_episodes = True
                    train_rl = False

            # update stats
            model_learner.log_stats(runner.t_env)
            if (runner.t_env - last_log_T) >= args.log_interval:
                logger.log_stat("model_rl_iterations", rl_iterations, runner.t_env)
            if (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0):
                print(f"Logging rl stats")
                model_learner.log_rl_stats(rl_iterations)

        else:
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)
            if args.save_episodes and args.save_policy_outputs and args.runner == "episode":
                mac.save_policy_outputs()
            if buffer.can_sample(args.batch_size):
                for _ in range(args.batch_size_run):
                    episode_sample = buffer.sample(args.batch_size)

                    # Truncate batch to only filled timesteps
                    max_ep_t = episode_sample.max_t_filled()
                    episode_sample = episode_sample[:, :max_ep_t]

                    if episode_sample.device != args.device:
                        episode_sample.to(args.device)

                    learner.train(episode_sample, runner.t_env, episode)
                    rl_iterations += 1
                    print(f"RL iteration {rl_iterations}, t_env: {runner.t_env}")

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if ((runner.t_env - last_test_T) / args.test_interval >= 1.0) or (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0):

            print("Running test cases")

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            last_rl_T = rl_iterations
            runner.t_rl = rl_iterations

            for _ in range(n_test_runs):
                runner.run(test_mode=True)

            logger.print_recent_stats()

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        if args.save_model and model_trained and (rl_iterations == 0 or (rl_iterations - rl_model_save_time)/args.rl_save_model_interval >= 1.0):
            print(f"Saving at RL model iteration {rl_iterations}")
            rl_model_save_time = rl_iterations
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, f"rl_{rl_iterations}")
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("rl_iterations", rl_iterations, runner.t_env)
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #5
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    to_index_flag = bool(getattr(args, 'to_index_flag', False))
    # Set up schemes and groups here
    env_info = runner.get_env_info()
    # if args.disc_state:
    #     if args.env_args["map_name"] == '3m':
    #         state_num = 1077
    #         if to_index_flag:
    #             pass
    #         else:
    #             state_shape = state_num
    #     elif args.env_args["map_name"] == 'corridor':
    #         state_num = 5280
    #         if to_index_flag:
    #             pass
    #         else:
    #             state_shape = state_num
    #     elif args.env_args["map_name"] == '6h_vs_8z':
    #         state_num = 2884
    #         if to_index_flag:
    #             state_shape = 62
    #         else:
    #             state_shape = state_num
    #     elif args.env_args["map_name"] == '2s3z':
    #         state_num = 2325
    #         # state_num = 165
    #         if to_index_flag:
    #             state_shape = 20
    #         else:
    #             state_shape = state_num
    #     else:
    #         raise NotImplementedError
    # else:
    state_shape = env_info["state_shape"]
    state_num = env_info.get("state_num", None)
    # TEST
    # if args.env_args["map_name"] == '2s3z':
    #     state_shape = 120
    #     state_num = state_shape
    # state_shape = env_info["state_shape"]
    # state_num = env_info.get("state_num", None)
    # state_num = state_shape
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = state_shape
    args.state_num = state_num
    args.all_obs = env_info.get("all_obs", None)

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": state_shape
        },
        # "state": {"vshape": state_num},  # TEST
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
        "noise": {
            "vshape": (args.noise_dim, )
        }
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()
        runner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directiory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1

    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    # min_training_interval
    # training_interval_count = 0.0
    # episode_limit = env_info["episode_limit"]
    last_train_T = -env_info["episode_limit"] - 1
    # args.env_args.episode_limit
    # train_intervel_step = 0
    training_times = 0

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        time_stamp = time.time()

        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        time_stamp = time_spent(time_stamp, 'Sampling')

        if buffer.can_sample(args.batch_size):
            if (runner.t_env -
                    last_train_T) / env_info["episode_limit"] >= 0.9:
                episode_sample = buffer.sample(args.batch_size)

                # Truncate batch to only filled timesteps
                training_times += 1
                logger.console_logger.info(
                    "t_env: {} / training_times {}".format(
                        runner.t_env, training_times))
                # print('training_times', training_times)
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                time_stamp = time.time()

                learner.train(episode_sample, runner.t_env, episode)
                last_train_T = runner.t_env

                time_stamp = time_spent(time_stamp, 'Training')

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

            if args.noise_bandit:
                for _ in range(n_test_runs):
                    runner.run(test_mode=True, test_uniform=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.results_path, "models",
                                     args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)
            runner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #6
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]

    args.save_model = True  # needs to be set externally

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    # ---------------------------------
    # Set up the multi-agent controller
    # ---------------------------------
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    # -------------------------
    # If checkpoint_path is not empty, the model must first be loaded from checkpoint_path
    # -------------------------
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        #
        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directiory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        # ----------------------------
        # Load the model from disk
        # 1. Build the model path: args.checkpoint_path corresponds to the checkpoint_path entry in config/default.yaml
        # 2. Load the model
        # ----------------------------
        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        # ------------------------------
        # If cal_max_expectation_tasks is true in default.yaml, use the already trained optimal model
        # to compute the maximum expected number of tasks instead of training a model.
        # ------------------------------
        if args.cal_max_expectation_tasks:
            cal_max_expectation_tasks(args, mac, learner, runner)
            return

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    global_reward = []
    global_state = []
    file_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output",
                             "train_reward.txt")
    state_path = os.path.join(os.path.dirname(__file__), "envs", "ec",
                              "output", "train_state.txt")

    test_state = []
    test_reward = []
    test_state_path = os.path.join(os.path.dirname(__file__), "envs", "ec",
                                   "output", "test_state.txt")
    test_reward_path = os.path.join(os.path.dirname(__file__), "envs", "ec",
                                    "output", "test_reward.txt")

    while runner.t_env <= args.t_max:  # t_env ?

        # Run for a whole episode at a time
        episode_batch = runner.run(
            test_mode=False)  # runner.run() returns one episode of data
        global_reward += get_episode_reward(
            episode_batch.data.transition_data)  # record the reward of every step
        global_state += get_episode_state(
            episode_batch.data.transition_data)  # record the state of every step

        # Save state/reward data from test mode: run tests every args.reward_period steps,
        # collecting args.reward_period test states.
        if runner.t_env % args.reward_period == 0:
            print(
                "--------------------------------- test mode -----------------------------------------"
            )
            for i in range(int(args.reward_period / 20)):
                episode_data = runner.run(test_mode=True)  # run in test mode
                test_state += get_episode_state(
                    episode_data.data.transition_data)
                test_reward += get_episode_reward(
                    episode_data.data.transition_data)

        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    save_state_reward(state_path, global_state)
    save_state_reward(file_path, global_reward)
    save_state_reward(test_state_path, test_state)
    save_state_reward(test_reward_path, test_reward)
    logger.console_logger.info("Finished Training")
Example #7
0
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Setup schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]

    # Default/base scheme
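    # (note) with env_args["reward_local"] each agent receives its own per-step reward, so the
    # "reward" entry is grouped by agents; otherwise a single shared team reward of shape (1,) is stored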
    reward_dict = {"vshape": (1,), "group": "agents", "dtype": th.float32} if args.env_args["reward_local"] else {"vshape": (1,)}
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": reward_dict,
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    # TODO: what is groups controlling?
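    # (note) groups maps a group name to its size; scheme entries tagged with "group": "agents"
    # are stored with an extra per-agent dimension of that size in the batch/replay buffer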
    groups = {
        "agents": args.n_agents
    }
    # TODO: where/how is pre processing applied?
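    # (note) preprocess transforms are applied when episode data is written into the buffer, e.g.
    # OneHot derives an "actions_onehot" field from the integer "actions" field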
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    # TODO: why create replaybuffer with episode limit + 1?
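    # (likely) the +1 leaves room for the entry after the final transition, so every step of a
    # full-length episode still has a successor state/obs available for bootstrapping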
    # Setup replaybuffer
    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multi-agent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Setup runner with created scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Setup learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    # Activate CUDA
    if args.use_cuda:
        learner.cuda()

    # Load checkpoint if necessary
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        # Check that the checkpoint path exists, otherwise no model can be loaded later
        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        # TODO: enforce learner loading correct model?
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    #
    # Start training
    #
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time -> runner returns an episode batch
        episode_batch = runner.run(test_mode=False)
        # Save episode in replay buffer
        buffer.insert_episode_batch(episode_batch)

        # If enough episodes saved -> sample
        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            # TODO: explain max_t_filled
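            # (note) max_t_filled() gives the length of the longest episode in this sample,
            # so the zero-padded tail beyond that point can be discarded before training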
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            # TODO: when is device differing?!
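            # (note) the devices differ when buffer_cpu_only keeps the replay buffer on the CPU
            # while training runs on args.device (e.g. the GPU); the sampled batch is moved over here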
            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            # Train on sampled episodes
            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
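        # (note) with a parallel runner a single runner.run() call plays runner.batch_size episodes,
        # so n_test_runs calls below give roughly args.test_nepisode test episodes in total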
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save model after certain time
        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        # Increase total episode counter by batch size of episodes currently run
        # TODO: follow batch_size_run!
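        # (note) batch_size_run is the number of environments the runner steps in parallel,
        # i.e. how many episodes each runner.run() call contributes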
        episode += args.batch_size_run

        # Log stats in interval
        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #8
0
def run_sequential(args, logger):
    """
    The actual training entry point.
    :param args:
    :type args:
    :param logger:
    :type logger:
    :return:
    :rtype:
    """
    # Init runner so we can get env info. args.runner selects which runner is used:
    # ParallelRunner in src/runners/parallel_runner.py or the one in episode_runner.py
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    # number of agents, e.g. 8
    args.n_agents = env_info["n_agents"]
    # number of actions, e.g. 6
    args.n_actions = env_info["n_actions"]
    # dimension of the state, e.g. 300
    args.state_shape = env_info["state_shape"]

    if getattr(args, 'agent_own_state_size', False):
        args.agent_own_state_size = get_agent_own_state_size(args.env_args)

    # Custom scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "probs": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.float},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }
    # Replay buffer
    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)
    # Set up the multi-agent controller here, e.g. NMAC from src/controllers/n_controller.py
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give the runner this scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner: e.g. NQLearner from src/learners/nq_learner.py; different algorithms use different learners
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":
        # Load the checkpoint and continue training
        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs whose names are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # Start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("开始训练,训练的 {} 个时间步".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time

        with th.no_grad():
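            # (note) rollouts only need forward passes, so no_grad avoids building autograd
            # graphs and reduces memory use while collecting the episode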
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate the batch to only the filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)
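            # (note) dropping the sampled batch explicitly lets its (possibly GPU) tensors be
            # freed before the next rollout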
            del episode_sample

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("完成训练")
Example #9
0
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)
    th.autograd.set_detect_anomaly(True)
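    # (note) anomaly detection helps trace NaN/inf gradients back to the op that produced them,
    # but it slows training considerably and is normally only enabled while debugging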
    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]

    #    args.own_feature_size = env_info["own_feature_size"] #unit_type_bits+shield_bits_ally
    #if args.obs_last_action:
    #    args.own_feature_size+=args.n_actions
    #if args.obs_agent_id:
    #    args.own_feature_size+=args.n_agents

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
    }
    if args.learner == "hierarchical_rode_learner":
        scheme.update({
            "role_avail_actions": {
                "vshape": (env_info["n_actions"], ),
                "group": "agents",
                "dtype": th.int
            },
            "roles": {
                "vshape": (1, ),
                "group": "agents",
                "dtype": th.long
            }
        })
    if args.learner == "hierarchical_noise_q_learner":
        scheme.update({"noise": {"vshape": (args.noise_dim, )}})
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    if args.q_net_ensemble:
        mac = [
            mac_REGISTRY[args.mac](buffer.scheme, groups, args)
            for _ in range(args.ensemble_num)
        ]
    else:
        mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()
        if args.runner == "meta_noise":
            runner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directiory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    if args.meta_h:
        last_meta_T = -args.meta_h_interval - 1
        meta_buffer = ReplayBuffer(
            scheme,
            groups,
            args.batch_size,
            env_info["episode_limit"] + 1,
            preprocess=preprocess,
            device="cpu" if args.buffer_cpu_only else args.device)
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))
    use_rode = args.learner == "hierarchical_rode_learner"
    meta_start_t = 0
    if args.learner == "hierarchical_rode_learner":
        meta_start_t = args.role_action_spaces_update_start
    if args.save_batch_interval > 0:
        last_save_batch = -args.save_batch_interval - 1
    whole_q_list = []
    if args.save_q_all:
        q_list_ind = 0
    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        # if args.meta_h:
        #     episode_batch, batch_log_p, mean_step_returns = runner.run(test_mode=False, meta_mode=True)
        # else:
        #     episode_batch, _ = runner.run(test_mode=False) #[8,181,10,1] for actions
        episode_batch, _ = runner.run(
            test_mode=False, use_rode=use_rode)  #[8,181,10,1] for actions
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size) and args.meta_h and \
            (runner.t_env - last_meta_T) / args.meta_h_interval >= 1.0 and runner.t_env >= meta_start_t:
            repeat_times = args.batch_size // runner.batch_size
            # meta_buffer.insert_episode_batch(episode_batch)
            batch_log_p_all = []
            mean_step_returns_all = []
            for _ in range(repeat_times):
                #[8]
                # episode_batch, batch_log_p, mean_step_returns = runner.run_meta(test_mode=False, meta_mode=True)
                # batch_log_p_all.append(batch_log_p)
                episode_batch, _, mean_step_returns = runner.run_meta(
                    test_mode=False, meta_mode=True, use_rode=use_rode)
                mean_step_returns_all += mean_step_returns
                buffer.insert_episode_batch(episode_batch[0])
                meta_buffer.insert_episode_batch(episode_batch)
            #[32]
            # batch_log_p_all = th.cat(batch_log_p_all, dim=0)
            for _ in range(repeat_times):
                episode = prep_ep_and_train(meta_buffer, args, learner,
                                            episode, runner.t_env,
                                            whole_q_list)
            mean_step_returns_new_all = []
            for _ in range(repeat_times):
                episode_batch_new, mean_step_returns_new = runner.run_meta(
                    test_mode=False, use_rode=use_rode)
                buffer.insert_episode_batch(episode_batch_new[0])
                mean_step_returns_new_all += mean_step_returns_new
            # need to get batch_log_p here
            batch_log_p_all = runner.get_log_p(meta_buffer)
            learner.train_meta(batch_log_p_all, mean_step_returns_all,
                               mean_step_returns_new_all, runner.t_env)
            for _ in range(repeat_times):
                episode = prep_ep_and_train(buffer, args, learner, episode,
                                            runner.t_env, whole_q_list)
            last_meta_T = runner.t_env
        elif buffer.can_sample(args.batch_size):
            prep_ep_and_train(buffer, args, learner, episode, runner.t_env,
                              whole_q_list)
            # episode_sample = buffer.sample(args.batch_size) #[32,181,10,1] for actions

            # # Truncate batch to only filled timesteps
            # max_ep_t = episode_sample.max_t_filled()
            # episode_sample = episode_sample[:, :max_ep_t]

            # if episode_sample.device != args.device:
            #     episode_sample.to(args.device)

            # learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            save_batch_flag = False
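            # (note) during the first fifth of training the effective save interval below is divided
            # by 10, so rollout batches are saved ten times more frequently early on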
            discount = 1.0 if args.t_max // 5 <= runner.t_env else 10.0
            if args.save_batch_interval > 0 and (
                    runner.t_env - last_save_batch) / (
                        args.save_batch_interval // discount) >= 1.0:
                save_batch_flag = True
                last_save_batch = runner.t_env
            for i in range(n_test_runs):
                if args.runner == "meta" or args.runner == "meta_noise":
                    runner.run_meta(test_mode=True, use_rode=use_rode)
                else:
                    runner.run(test_mode=True, use_rode=use_rode)
                if save_batch_flag:
                    save_batch(runner.batch, osp.join(args.tb_logs, "batch"),
                               runner.t_env, i)
            if args.noise_bandit:
                for _ in range(n_test_runs):
                    runner.run_meta(test_mode=True, test_uniform=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run if args.runner != "meta" and args.runner != "meta_noise" else 1

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env
        if args.save_q_all and len(whole_q_list) >= 4000:
            save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind)
            whole_q_list.clear()
            q_list_ind += 1

    if args.save_q_all and len(whole_q_list) > 0:
        save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind)
    runner.close_env()
    logger.console_logger.info("Finished Training")
Example #10
0
def run_sequential(args, logger):

    # Init runner(episode runner or parallel runner) so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]  # from smac maps
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    # args.unit_type_bits = env_info["unit_type_bits"]
    # args.shield_bits_ally = env_info["shield_bits_ally"]
    # args.shield_bits_enemy = env_info["shield_bits_enemy"]
    # args.n_enemies = env_info["n_enemies"]

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
        #"policy": {"vshape": (env_info["n_agents"],)}
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)
    off_buffer = ReplayBuffer(
        scheme,
        groups,
        args.off_buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    runner.set_learner(learner)

    ###### If checkpoint_path is given, and args.evaluate == True or args.save_replay == True,
    ###### then this function returns after evaluation without doing any training.
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                f"Checkpoint directiory {args.checkpoint_path} doesn't exist")
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info(f"Loading model from {model_path}")
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return
    ########################################################################################################

    ######## start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info(
        f"Beginning training for {args.t_max} timesteps")

    while runner.t_env <= args.t_max:

        # critic running log
        running_log = {
            "critic_loss": [],
            "critic_grad_norm": [],
            "td_error_abs": [],
            "target_mean": [],
            "q_taken_mean": [],
            "q_max_mean": [],
            "q_min_mean": [],
            "q_max_var": [],
            "q_min_var": []
        }
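        # (note) running_log is handed to the learner below so critic training statistics from
        # this iteration can be accumulated for logging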

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)
        off_buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size) and off_buffer.can_sample(
                args.off_batch_size):
            # train critic normally
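            # (note) the critic is updated from a uniform sample of the recent buffer plus a
            # best_batch drawn from the larger off_buffer, while the actor below is trained only
            # on the latest on-policy episodes via sample_latest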
            uni_episode_sample = buffer.uni_sample(args.batch_size)
            off_episode_sample = off_buffer.uni_sample(args.off_batch_size)
            max_ep_t = max(uni_episode_sample.max_t_filled(),
                           off_episode_sample.max_t_filled())
            uni_episode_sample = process_batch(
                uni_episode_sample[:, :max_ep_t], args)
            off_episode_sample = process_batch(
                off_episode_sample[:, :max_ep_t], args)
            learner.train_critic(uni_episode_sample,
                                 best_batch=off_episode_sample,
                                 log=running_log)

            # train actor
            episode_sample = buffer.sample_latest(args.batch_size)
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = process_batch(episode_sample[:, :max_ep_t], args)
            learner.train(episode_sample, runner.t_env, running_log)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info(f"t_env: {runner.t_env} / {args.t_max}")
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save model every {save_model_interval} timesteps
        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info(f"Saving models to {save_path}")

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")