Example #1
def main(_config, _run):
    config = convert(_config)
    _id = _run._id

    # Logging stuff
    logger = logging.getLogger("Main")
    if config.mongo:
        logging.disable(logging.WARNING)
    configure_stats_logging(
        str(_id) + "_" + config.name,
        log_interval=config.log_interval,
        sacred_info=_run.info,
        use_tb=config.tb,
    )
    stats = get_stats()

    logger.critical("ID: {}".format(_id))
    # Update config with environment specific information
    env = gym.make(config.env)
    num_actions = env.action_space.n
    config = config._replace(num_actions=num_actions)
    state_shape = env.observation_space.shape
    config = config._replace(state_shape=state_shape)
    # Wrap env
    env = EnvWrapper(env, debug=True, args=config)

    # Log the config
    config_str = "Config:\n\n"
    for k, v in sorted(config._asdict().items()):
        config_str += "     {}: {}\n".format(k, v)
    logger.critical(config_str)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.critical("Device: {}".format(device.type))

    # Make agent and target agent
    agent = get_model(config.agent)(config)
    target_agent = get_model(config.agent)(config)
    target_agent.load_state_dict(agent.state_dict())
    agent.to(device)
    target_agent.to(device)

    # Pseudocount stuff
    count_model = None
    if config.count_rewards:
        if config.atari_count:
            count_model = AtariCount(config)
        elif config.rnd_net_count:
            # assert config.count_state_only_rewards
            count_model = RndNetworkDistill(config, device)
        elif config.dora_count:
            count_model = DoraCount(config, device)
        else:
            count_model = PseudoCount(config)

    # Make action selector
    action_selector = None
    if config.action_selector == "eps_greedy":
        action_selector = eps_greedy.EpsGreedy(config)
    elif config.action_selector == "optimistic_action":
        action_selector = optimistic_action.OptimisticAction(
            count_model, config)
    elif config.action_selector == "bsp":
        action_selector = bsp_action.BSPAction(config)
    else:
        raise Exception("{} is not an Action Selector!".format(
            config.action_selector))

    # Make replay buffer
    # Check if the obs dtype of the environment is an int
    obs_dtype = getattr(env.wrapped_env, "obs_dtype", np.float32)
    obs_scaling = getattr(env.wrapped_env, "obs_scaling", 1)
    replay_buffer = ReplayBuffer(size=config.buffer_size,
                                 frame_history_len=config.past_frames_input,
                                 obs_dtype=obs_dtype,
                                 obs_scaling=obs_scaling,
                                 args=config)

    if config.dora_count:
        dora_buffer = ReplayBuffer(size=config.batch_size * 4,
                                   frame_history_len=config.past_frames_input,
                                   obs_dtype=obs_dtype,
                                   obs_scaling=obs_scaling,
                                   args=config)

    # Make trainer
    trainer = None
    if config.trainer == "DQN":
        trainer = DQNTrainer(agent=agent,
                             target_agent=target_agent,
                             args=config,
                             count_model=count_model,
                             buffer=replay_buffer)
    else:
        raise Exception("{} is not a Trainer!".format(config.trainer))
    testing_buffer = ReplayBuffer(size=(config.past_frames_input + 1),
                                  frame_history_len=config.past_frames_input,
                                  args=config)

    # Testing stuff
    testing_env = EnvWrapper(env=gym.make(config.env), debug=True, args=config)
    if config.test_augmented:
        assert config.action_selector == "optimistic_action"

    # Player Positions
    positions = set()
    action_positions = set()

    T = 0
    start_time = time.time()
    last_time = start_time

    # Lots of code duplication :(
    logger.critical("Filling buffer with {:,} random experiences.".format(
        config.buffer_burn_in))
    state = env.reset()
    assert config.buffer_burn_in == 0
    for t in range(config.buffer_burn_in):
        buffer_idx = replay_buffer.store_frame(state)
        stacked_states = replay_buffer.encode_recent_observation()
        tensor_state = torch.tensor(stacked_states, device=device).unsqueeze(0)
        action = np.random.randint(config.num_actions)
        next_state, reward, terminated, info = env.step(action)
        terminal_to_store = terminated
        if "Steps_Termination" in info and info["Steps_Termination"]:
            terminal_to_store = False

        intrinsic_reward = 0
        pseudo_count = 0
        if config.count_rewards:
            pseudo_count = count_model.visit(tensor_state, action)
            if getattr(count_model, "reward_directly", False):
                intrinsic_reward = pseudo_count
            else:
                count_bonus = config.count_beta / sqrt(pseudo_count)
                intrinsic_reward = count_bonus

        replay_buffer.store_effect(buffer_idx, action,
                                   reward - config.reward_baseline,
                                   intrinsic_reward, terminal_to_store,
                                   pseudo_count)
        state = next_state
        if terminated:
            state = env.reset()
            logger.warning("Random action burn in t: {:,}".format(t))

    state = env.reset()
    episode = 0
    episode_reward = 0
    intrinsic_episode_reward = 0
    episode_length = 0
    env_positive_reward = 0
    max_episode_reward = 0
    if config.bsp:
        bsp_k = np.random.randint(config.bsp_k)
        action_selector.update_k(bsp_k)

    logger.critical("Beginning training.")

    while T < config.t_max:

        # Store the current state
        buffer_idx = replay_buffer.store_frame(state)
        if config.dora_count:
            dora_idx = dora_buffer.store_frame(state)

        # Get the stacked input vector
        stacked_states = replay_buffer.encode_recent_observation()

        # Get output from agent
        with torch.no_grad():
            tensor_state = torch.tensor(stacked_states,
                                        device=device).unsqueeze(0)
            agent_output = agent(tensor_state)
            # agent_output = agent(torch.Tensor(stacked_states).unsqueeze(0))

        # Select action
        action, action_info = action_selector.select_actions(
            agent_output, T, info={"state": tensor_state})

        # Take an environment step
        next_state, reward, terminated, info = env.step(action)
        T += 1
        stats.update_t(T)
        episode_reward += reward
        episode_length += 1
        terminal_to_store = terminated
        if "Steps_Termination" in info and info["Steps_Termination"]:
            logger.warning("Terminating because of episode limit")
            terminal_to_store = False

        # Log if a positive reward was ever received from environment. ~Finding goal
        if reward > 0.1:
            env_positive_reward = 1
        stats.update_stats("Positive_Reward", env_positive_reward)

        # Calculate count based intrinsic motivation
        intrinsic_reward = 0
        pseudo_count = 0
        if config.count_rewards:
            pseudo_count = count_model.visit(tensor_state, action)
            if getattr(count_model, "reward_directly", False):
                # The count-model is giving us the intrinsic reward directly
                intrinsic_reward = pseudo_count[0]
            else:
                # Count-model is giving us the pseudo-count
                count_bonus = config.count_beta / sqrt(pseudo_count)
                intrinsic_reward = count_bonus
            intrinsic_episode_reward += intrinsic_reward

        # Render training
        if config.render_train_env:
            debug_info = {}
            debug_info.update(action_info)
            env.render(debug_info=debug_info)

        # Add what happened to the buffer
        replay_buffer.store_effect(buffer_idx, action,
                                   reward - config.reward_baseline,
                                   intrinsic_reward, terminal_to_store,
                                   pseudo_count)
        if config.dora_count:
            dora_buffer.store_effect(dora_idx, action,
                                     reward - config.reward_baseline,
                                     intrinsic_reward, terminal_to_store,
                                     pseudo_count)

        # Update state
        state = next_state

        # If terminated
        if terminated:
            # If we terminated due to episode limit, we need to add the current state in
            if "Steps_Termination" in info and info["Steps_Termination"]:
                buffer_idx = replay_buffer.store_frame(state)
                replay_buffer.store_effect(buffer_idx,
                                           0,
                                           0,
                                           0,
                                           True,
                                           0,
                                           dont_sample=True)
                if config.dora_count:
                    dora_idx = dora_buffer.store_frame(state)
                    dora_buffer.store_effect(dora_idx,
                                             0,
                                             0,
                                             0,
                                             True,
                                             0,
                                             dont_sample=True)

            logger.warning("T: {:,}, Episode Reward: {:.2f}".format(
                T, episode_reward))
            state = env.reset()
            max_episode_reward = max(max_episode_reward, episode_reward)
            stats.update_stats("Episode Reward", episode_reward)
            stats.update_stats("Max Episode Reward", max_episode_reward)
            stats.update_stats("Episode Length", episode_length)
            stats.update_stats("Intrin Eps Reward", intrinsic_episode_reward)
            episode_reward = 0
            episode_length = 0
            intrinsic_episode_reward = 0
            episode += 1
            stats.update_stats("Episode", episode)
            if config.bsp:
                bsp_k = np.random.randint(config.bsp_k)
                action_selector.update_k(bsp_k)

        # Train if possible
        for _ in range(config.training_iters):
            sampled_batch = None

            if T % config.update_freq != 0:
                # Only train every update_freq timesteps
                continue
            if replay_buffer.can_sample(config.batch_size):
                sampled_batch = replay_buffer.sample(config.batch_size,
                                                     nstep=config.n_step)

            if sampled_batch is not None:
                trainer.train(sampled_batch)

            if config.dora_count:
                if dora_buffer.can_sample(config.batch_size):
                    # Sample from the DoRA buffer rather than the main replay buffer
                    sampled_batch = dora_buffer.sample(config.batch_size,
                                                       nstep=config.n_step)
                if sampled_batch is not None:
                    count_model.train(sampled_batch)

        # Update target networks if necessary
        if T % config.target_update_interval == 0:
            trainer.update_target_agent()
            if config.dora_count:
                count_model.update_target_agent()

        # Logging
        if config.bsp:
            agent_output = agent_output[:, :, bsp_k]
        q_vals_numpy = agent_output.detach().cpu()[0].numpy()
        if num_actions < 20:
            for action_id in range(config.num_actions):
                stats.update_stats("Q-Value_{}".format(action_id),
                                   q_vals_numpy[action_id])
        else:
            stats.update_stats("Q-Value_Mean", np.mean(q_vals_numpy))
        player_pos = env.log_visitation()
        positions.add(player_pos)
        action_positions.add((player_pos, action))
        stats.update_stats("States Visited", len(positions))
        stats.update_stats("State_Actions Visited", len(action_positions))
        stats.update_stats("Player Position", player_pos)
        # Log all env stats returned
        for k, v in info.items():
            if k != "Steps_Termination":
                stats.update_stats(k, v)

        if config.save_count_gifs > 0 and T % config.save_count_gifs == 0:
            if count_model is not None:
                state_action_counts, count_nums = env.count_state_action_space(
                    count_model)
                if state_action_counts is not None:
                    save_image(state_action_counts,
                               image_name="SA_Counts__{}_Size__{}_T".format(
                                   config.count_size, T),
                               direc_name="State_Action_Counts")
                    save_sa_count_vals(count_nums,
                                       name="SA_PCounts__{}_Size__{}_T".format(
                                           config.count_size, T),
                                       direc_name="Sa_Count_Estimates")

                actual_counts = env.state_counts()
                if actual_counts is not None:
                    save_actual_counts(actual_counts,
                                       name="Counts__{}_T".format(T),
                                       direc_name="Actual_Counts")

                q_val_img, q_vals = env.q_value_estimates(count_model, agent)
                if q_val_img is not None:
                    save_image(q_val_img,
                               image_name="Q_Vals__{}_Size__{}_T".format(
                                   config.count_size, T),
                               direc_name="Q_Value_Estimates")
                if q_vals is not None:
                    save_q_vals(q_vals,
                                name="Q_Vals__{}_Size__{}_T".format(
                                    config.count_size, T),
                                direc_name="Q_Value_Estimates")

        # Testing
        with torch.no_grad():
            if T % config.testing_interval == 0:

                prefixes = [""]
                if config.test_augmented:
                    prefixes += ["Aug_"]

                for prefix in prefixes:
                    total_test_reward = 0
                    total_test_length = 0
                    for _ in range(config.test_episodes):
                        test_episode_reward = 0
                        test_episode_length = 0
                        test_state = testing_env.reset()
                        test_env_terminated = False

                        while not test_env_terminated:
                            test_buffer_idx = testing_buffer.store_frame(
                                test_state)
                            stacked_test_states = testing_buffer.encode_recent_observation()
                            test_tensor_state = torch.tensor(
                                stacked_test_states,
                                device=device).unsqueeze(0)
                            testing_agent_output = agent(test_tensor_state)

                            if prefix == "Aug_" or config.bsp:
                                test_action, _ = action_selector.select_actions(
                                    testing_agent_output,
                                    T,
                                    info={"state": test_tensor_state},
                                    testing=True)
                            else:
                                test_action = get_test_action(
                                    testing_agent_output, config)

                            next_test_state, test_reward, test_env_terminated, _ = testing_env.step(
                                test_action)
                            if config.render_test_env:
                                testing_env.render()

                            test_episode_length += 1
                            test_episode_reward += test_reward

                            testing_buffer.store_effect(
                                test_buffer_idx, test_action, test_reward, 0,
                                test_env_terminated, 0)

                            test_state = next_test_state

                        total_test_length += test_episode_length
                        total_test_reward += test_episode_reward

                    mean_test_reward = total_test_reward / config.test_episodes
                    mean_test_length = total_test_length / config.test_episodes

                    logger.error(
                        "{}Testing -- T: {:,}/{:,}, Test Reward: {:.2f}, Test Length: {:,}"
                        .format(prefix, T, config.t_max, mean_test_reward,
                                mean_test_length))

                    stats.update_stats("{}Test Reward".format(prefix),
                                       mean_test_reward,
                                       always_log=True)
                    stats.update_stats("{}Test Episode Length".format(prefix),
                                       mean_test_length,
                                       always_log=True)

                logger.error("Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, T - config.testing_interval,
                              T, config.t_max),
                    time_str(time.time() - start_time)))
                last_time = time.time()

        if T % (config.log_interval * 4) == 0:
            stats.print_stats()

    logger.critical("Closing envs")
    env.close()
    testing_env.close()

    logger.critical("Finished training.")

    if client is not None:
        logger.critical("Attempting to close pymongo client")
        client.close()
        logger.critical("Pymongo client closed")

    logger.critical("Exiting")
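The intrinsic reward above uses the standard count-based bonus beta / sqrt(N(s, a)). Below is a minimal, self-contained sketch of the same idea with an exact dictionary-backed count table; the names are illustrative and not from this codebase, which uses PseudoCount/AtariCount/RND models instead.

from collections import defaultdict
from math import sqrt


class TabularCount:
    """Toy count model: exact visit counts keyed by (state, action)."""

    def __init__(self, beta=0.1):
        self.beta = beta
        self.counts = defaultdict(int)

    def visit(self, state_key, action):
        # Increment and return the visit count for this state-action pair.
        self.counts[(state_key, action)] += 1
        return self.counts[(state_key, action)]

    def bonus(self, count):
        # Same form as count_beta / sqrt(pseudo_count) in the loop above.
        return self.beta / sqrt(count)


# Usage:
# count_model = TabularCount(beta=0.1)
# n = count_model.visit(state_key, action)
# r_int = count_model.bonus(n)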
Example #2
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "role_avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
        "roles": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        }
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames for critics and optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
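The "actions" preprocess entry above converts the stored long action indices into a one-hot encoding before training. A minimal sketch of that conversion in plain PyTorch (standalone, not the OneHot transform class registered here):

import torch
import torch.nn.functional as F


def actions_to_onehot(actions, n_actions):
    # actions: LongTensor of shape [batch, time, n_agents, 1]
    # returns: FloatTensor of shape [batch, time, n_agents, n_actions]
    return F.one_hot(actions.squeeze(-1), num_classes=n_actions).float()


# Usage:
# actions = torch.randint(0, 5, (8, 20, 3, 1))
# onehot = actions_to_onehot(actions, n_actions=5)  # -> shape [8, 20, 3, 5]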
Example #3
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    #args.action_space = env_info["action_space"]
    args.action_spaces = env_info["action_spaces"]
    args.actions_dtype = env_info["actions_dtype"]
    # If true, action vectors need to sum to one
    args.normalise_actions = env_info.get("normalise_actions", False)


    # create function scaling agent action tensors to and from range [0,1]
    ttype = th.FloatTensor if not args.use_cuda else th.cuda.FloatTensor
    mult_coef_tensor = ttype(args.n_agents, args.n_actions)
    action_min_tensor = ttype(args.n_agents, args.n_actions)
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].shape[0]):
                _action_min = args.action_spaces[_aid].low[_actid]
                _action_max = args.action_spaces[_aid].high[_actid]
                mult_coef_tensor[_aid, _actid] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = np.asscalar(_action_min)
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        # NOTE: This was added to handle scenarios like simple_reference since the action space is Tuple
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].spaces[0].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[0].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[0].high[_actid]
                mult_coef_tensor[_aid, _actid] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = np.asscalar(_action_min)
            for _actid in range(args.action_spaces[_aid].spaces[1].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[1].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[1].high[_actid]
                tmp_idx = _actid + args.action_spaces[_aid].spaces[0].shape[0]
                mult_coef_tensor[_aid, tmp_idx] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, tmp_idx] = np.asscalar(_action_min)

    args.actions2unit_coef = mult_coef_tensor
    args.actions2unit_coef_cpu = mult_coef_tensor.cpu()
    args.actions2unit_coef_numpy = mult_coef_tensor.cpu().numpy()
    args.actions_min = action_min_tensor
    args.actions_min_cpu = action_min_tensor.cpu()
    args.actions_min_numpy = action_min_tensor.cpu().numpy()

    def actions_to_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return args.actions2unit_coef_numpy * actions + args.actions_min_numpy
        elif actions.is_cuda:
            return args.actions2unit_coef * actions + args.actions_min
        else:
            return args.actions2unit_coef_cpu * actions + args.actions_min_cpu

    def actions_from_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return th.div((actions - args.actions_min_numpy), args.actions2unit_coef_numpy)
        elif actions.is_cuda:
            return th.div((actions - args.actions_min), args.actions2unit_coef)
        else:
            return th.div((actions - args.actions_min_cpu), args.actions2unit_coef_cpu)

    # make conversion functions globally available
    args.actions2unit = actions_to_unit_box
    args.unit2actions = actions_from_unit_box

    action_dtype = th.long if not args.actions_dtype == np.float32 else th.float
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else max([i.shape[0] for i in args.action_spaces])
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else \
                                       max([i.spaces[0].shape[0] + i.spaces[1].shape[0] for i in args.action_spaces])
    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (actions_vshape,), "group": "agents", "dtype": action_dtype},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }

    if not args.actions_dtype == np.float32:
        preprocess = {
            "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
        }
    else:
        preprocess = {}

    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1 if args.runner_scope == "episodic" else 2,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = - args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        if getattr(args, "runner_scope", "episodic") == "episodic":
            episode_batch = runner.run(test_mode=False, learner=learner)
            buffer.insert_episode_batch(episode_batch)

            if buffer.can_sample(args.batch_size) and (buffer.episodes_in_buffer > getattr(args, "buffer_warmup", 0)):
                episode_sample = buffer.sample(args.batch_size)

                # Truncate batch to only filled timesteps
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                learner.train(episode_sample, runner.t_env, episode)
        elif getattr(args, "runner_scope", "episode") == "transition":
            runner.run(test_mode=False,
                       buffer=buffer,
                       learner=learner,
                       episode=episode)
        else:
            raise Exception("Undefined runner scope!")

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            if getattr(args, "testing_on", True):
                for _ in range(n_test_runs):
                    if getattr(args, "runner_scope", "episodic") == "episodic":
                        runner.run(test_mode=True, learner=learner)
                    elif getattr(args, "runner_scope", "episode") == "transition":
                        runner.run(test_mode=True,
                                   buffer = buffer,
                                   learner = learner,
                                   episode = episode)
                    else:
                        raise Exception("Undefined runner scope!")

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames for critics and optimizer states
            # learner.save_models(save_path, args.unique_token, model_save_time)

            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
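The coefficient and minimum tensors built above define a per-dimension affine map between the unit box [0, 1] and each Box action range (coef = high - low, min = low). A small standalone numpy sketch of the same round trip, with illustrative values:

import numpy as np

low = np.array([-1.0, 0.0])
high = np.array([1.0, 10.0])
coef = high - low  # plays the role of mult_coef_tensor
amin = low         # plays the role of action_min_tensor


def unit_to_box(u):
    # [0, 1]^d -> [low, high], same formula as actions_to_unit_box above
    return coef * u + amin


def box_to_unit(a):
    # [low, high] -> [0, 1]^d, same formula as actions_from_unit_box above
    return (a - amin) / coef


a = unit_to_box(np.array([0.5, 0.5]))  # -> [0.0, 5.0]
u = box_to_unit(a)                     # -> [0.5, 0.5]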
Example #4
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.episode_limit = env_info["episode_limit"]
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.unit_dim = env_info["unit_dim"]

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    env_name = args.env
    if env_name == 'sc2':
        env_name += '/' + args.env_args['map_name']

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        args.burn_in_period,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    if args.is_save_buffer:
        save_buffer = ReplayBuffer(
            scheme,
            groups,
            args.save_buffer_size,
            env_info["episode_limit"] + 1,
            args.burn_in_period,
            preprocess=preprocess,
            device="cpu" if args.buffer_cpu_only else args.device)

    if args.is_batch_rl:
        assert not args.is_save_buffer
        x_env_name = env_name
        if args.is_from_start:
            x_env_name += '_from_start/'
        path_name = '../../buffer/' + x_env_name + '/buffer_' + str(
            args.load_buffer_id) + '/'
        assert os.path.exists(path_name)
        buffer.load(path_name)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    if args.env in ('matrix_game_1', 'matrix_game_2', 'matrix_game_3', 'mmdp_game_1'):
        last_demo_T = -args.demo_interval - 1

    while runner.t_env <= args.t_max:

        if not args.is_batch_rl:
            # Run for a whole episode at a time
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)

            if args.is_save_buffer:
                save_buffer.insert_episode_batch(episode_batch)
                if save_buffer.is_from_start and save_buffer.episodes_in_buffer == save_buffer.buffer_size:
                    save_buffer.is_from_start = False
                    save_one_buffer(args,
                                    save_buffer,
                                    env_name,
                                    from_start=True)
                if save_buffer.buffer_index % args.save_buffer_interval == 0:
                    print('current episodes_in_buffer: ',
                          save_buffer.episodes_in_buffer)

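        # args.num_circle sets how many training iterations are run per collected batch of episodes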
        for _ in range(args.num_circle):
            if buffer.can_sample(args.batch_size):
                episode_sample = buffer.sample(args.batch_size)

                if args.is_batch_rl:
                    runner.t_env += int(
                        th.sum(episode_sample['filled'])
                        .cpu().clone().detach().numpy()) // args.batch_size

                # Truncate batch to only filled timesteps
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                learner.train(episode_sample, runner.t_env, episode)

                if args.env == 'mmdp_game_1' and args.learner == "q_learner_exp":
                    for i in range(int(learner.target_gap) - 1):
                        episode_sample = buffer.sample(args.batch_size)

                        # Truncate batch to only filled timesteps
                        max_ep_t = episode_sample.max_t_filled()
                        episode_sample = episode_sample[:, :max_ep_t]

                        if episode_sample.device != args.device:
                            episode_sample.to(args.device)

                        learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)
        if args.env == 'mmdp_game_1' and \
                (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size):
            ### demo
            episode_sample = cp.deepcopy(buffer.sample(1))
            for i in range(args.n_actions):
                for j in range(args.n_actions):
                    new_actions = th.Tensor([i, j]).unsqueeze(0).repeat(
                        args.episode_limit + 1, 1)
                    if i == 0 and j == 0:
                        rew = th.Tensor([1])
                    else:
                        rew = th.Tensor([0])
                    if i == 1 and j == 1:
                        new_obs = th.Tensor(
                            [1, 0]).unsqueeze(0).unsqueeze(0).repeat(
                                args.episode_limit, args.n_agents, 1)
                    else:
                        new_obs = th.Tensor(
                            [0, 1]).unsqueeze(0).unsqueeze(0).repeat(
                                args.episode_limit, args.n_agents, 1)
                    # Truncate batch to only filled timesteps
                    max_ep_t = episode_sample.max_t_filled()
                    episode_sample = episode_sample[:, :max_ep_t]
                    episode_sample['actions'][0, :, :, 0] = new_actions
                    episode_sample['obs'][0, 1:, :, :] = new_obs
                    episode_sample['reward'][0, 0, 0] = rew
                    new_actions_onehot = th.zeros(
                        episode_sample['actions'].squeeze(3).shape +
                        (args.n_actions, ))
                    new_actions_onehot = new_actions_onehot.scatter_(
                        3, episode_sample['actions'].cpu(), 1)
                    episode_sample['actions_onehot'][:] = new_actions_onehot

                    if episode_sample.device != args.device:
                        episode_sample.to(args.device)

                    #print("action pair: %d, %d" % (i, j))
                    learner.train(episode_sample,
                                  runner.t_env,
                                  episode,
                                  show_demo=True,
                                  save_data=(i, j))
            last_demo_T = runner.t_env
            #time.sleep(1)

        if args.env in ('matrix_game_1', 'matrix_game_2', 'matrix_game_3') and \
                (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size):
            ### demo
            episode_sample = cp.deepcopy(buffer.sample(1))
            for i in range(args.n_actions):
                for j in range(args.n_actions):
                    new_actions = th.Tensor([i, j]).unsqueeze(0).repeat(
                        args.episode_limit + 1, 1)
                    # Truncate batch to only filled timesteps
                    max_ep_t = episode_sample.max_t_filled()
                    episode_sample = episode_sample[:, :max_ep_t]
                    episode_sample['actions'][0, :, :, 0] = new_actions
                    new_actions_onehot = th.zeros(
                        episode_sample['actions'].squeeze(3).shape +
                        (args.n_actions, )).cuda()
                    new_actions_onehot = new_actions_onehot.scatter_(
                        3, episode_sample['actions'].cuda(), 1)
                    episode_sample['actions_onehot'][:] = new_actions_onehot
                    if i == 0 and j == 0:
                        rew = th.Tensor([8])
                    elif i == 0 or j == 0:
                        rew = th.Tensor([-12])
                    else:
                        rew = th.Tensor([0])
                    if args.env == 'matrix_game_3':
                        if (i == 1 and j == 1) or (i == 2 and j == 2):
                            rew = th.Tensor([6])
                    episode_sample['reward'][0, 0, 0] = rew

                    if episode_sample.device != args.device:
                        episode_sample.to(args.device)

                    #print("action pair: %d, %d" % (i, j))
                    learner.train(episode_sample,
                                  runner.t_env,
                                  episode,
                                  show_demo=True,
                                  save_data=(i, j))
            last_demo_T = runner.t_env
            #time.sleep(1)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            if args.double_q:
                os.makedirs(save_path + '_x', exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames for critics and optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run * args.num_circle

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    if args.is_save_buffer and save_buffer.is_from_start:
        save_buffer.is_from_start = False
        save_one_buffer(args, save_buffer, env_name, from_start=True)

    runner.close_env()
    logger.console_logger.info("Finished Training")
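The demo blocks above rebuild "actions_onehot" from the overwritten action indices with scatter_. A minimal standalone sketch of that pattern (shapes are illustrative):

import torch as th

n_actions = 3
# actions: LongTensor of shape [batch, time, n_agents, 1]
actions = th.randint(0, n_actions, (1, 4, 2, 1))

# Allocate zeros of shape [batch, time, n_agents, n_actions] and scatter 1s
# along the last dimension at the stored action indices.
onehot = th.zeros(actions.squeeze(3).shape + (n_actions, ))
onehot = onehot.scatter_(3, actions, 1)  # each row is now a one-hot vector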
Example #5
def train(args, logger, learner, runner, buffer, engine_configuration_channel):
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    while runner.t_env <= args.t_max:
        engine_configuration_channel.set_configuration_parameters(
            time_scale=args.learning_time_scale)

        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            # Train the coordinator.
            learner.train(episode_sample, runner.t_env, episode)

        n_test_runs = max(1, args.test_nepisode // runner.batch_size)

        # Run test episodes at a regular interval.
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            engine_configuration_channel.set_configuration_parameters(
                time_scale=args.test_time_scale)
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save the trained model weights at a regular interval.
        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
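All of these loops report progress through time_left and time_str helpers. A rough sketch of what such helpers typically compute, assuming simple linear extrapolation (this is not the exact implementation used here):

import time


def time_str(seconds):
    # Human-readable duration, e.g. "1h 02m 03s".
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return "{}h {:02d}m {:02d}s".format(h, m, s)


def time_left(last_time, t_last, t_current, t_max):
    # Extrapolate remaining wall-clock time from the most recent interval.
    if t_current <= t_last:
        return "-"
    seconds_per_step = (time.time() - last_time) / (t_current - t_last)
    return time_str(seconds_per_step * (t_max - t_current))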
Example #6
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    to_index_flag = bool(getattr(args, 'to_index_flag', False))
    # Set up schemes and groups here
    env_info = runner.get_env_info()
    # if args.disc_state:
    #     if args.env_args["map_name"] == '3m':
    #         state_num = 1077
    #         if to_index_flag:
    #             pass
    #         else:
    #             state_shape = state_num
    #     elif args.env_args["map_name"] == 'corridor':
    #         state_num = 5280
    #         if to_index_flag:
    #             pass
    #         else:
    #             state_shape = state_num
    #     elif args.env_args["map_name"] == '6h_vs_8z':
    #         state_num = 2884
    #         if to_index_flag:
    #             state_shape = 62
    #         else:
    #             state_shape = state_num
    #     elif args.env_args["map_name"] == '2s3z':
    #         state_num = 2325
    #         # state_num = 165
    #         if to_index_flag:
    #             state_shape = 20
    #         else:
    #             state_shape = state_num
    #     else:
    #         raise NotImplementedError
    # else:
    state_shape = env_info["state_shape"]
    state_num = env_info.get("state_num", None)
    # TEST
    # if args.env_args["map_name"] == '2s3z':
    #     state_shape = 120
    #     state_num = state_shape
    # state_shape = env_info["state_shape"]
    # state_num = env_info.get("state_num", None)
    # state_num = state_shape
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = state_shape
    args.state_num = state_num
    args.all_obs = env_info.get("all_obs", None)

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": state_shape
        },
        # "state": {"vshape": state_num},  # TEST
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
        "noise": {
            "vshape": (args.noise_dim, )
        }
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()
        runner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1

    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    # min_training_interval
    # training_interval_count = 0.0
    # episode_limit = env_info["episode_limit"]
    last_train_T = -env_info["episode_limit"] - 1
    # args.env_args.episode_limit
    # train_intervel_step = 0
    training_times = 0

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        time_stamp = time.time()

        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        time_stamp = time_spent(time_stamp, 'Sampling')

        if buffer.can_sample(args.batch_size):
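            # Throttle updates: only train once roughly every episode_limit environment steps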
            if (runner.t_env -
                    last_train_T) / env_info["episode_limit"] >= 0.9:
                episode_sample = buffer.sample(args.batch_size)

                # Truncate batch to only filled timesteps
                training_times += 1
                logger.console_logger.info(
                    "t_env: {} / training_times {}".format(
                        runner.t_env, training_times))
                # print('training_times', training_times)
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                time_stamp = time.time()

                learner.train(episode_sample, runner.t_env, episode)
                last_train_T = runner.t_env

                time_stamp = time_spent(time_stamp, 'Training')

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

            if args.noise_bandit:
                for _ in range(n_test_runs):
                    runner.run(test_mode=True, test_uniform=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.results_path, "models",
                                     args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)
            runner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Exemplo n.º 7
0
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]

    args.save_model = True  # needs to be set externally

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    # -------------------------
    # If checkpoint_path is not empty, load the model from checkpoint_path first
    # -------------------------
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        # ----------------------------
        # Load the model from disk
        # 1. Set the model path: args.checkpoint_path corresponds to the checkpoint_path entry in config/default.yaml
        # 2. Load the model
        # ----------------------------
        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        # ------------------------------
        # If cal_max_expectation_tasks in default.yaml is true, use the already trained
        # (best) model to compute the maximum expected number of tasks instead of training.
        # ------------------------------
        if args.cal_max_expectation_tasks:
            cal_max_expectation_tasks(args, mac, learner, runner)
            return

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))

    global_reward = []
    global_state = []
    file_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output",
                             "train_reward.txt")
    state_path = os.path.join(os.path.dirname(__file__), "envs", "ec",
                              "output", "train_state.txt")

    test_state = []
    test_reward = []
    test_state_path = os.path.join(os.path.dirname(__file__), "envs", "ec",
                                   "output", "test_state.txt")
    test_reward_path = os.path.join(os.path.dirname(__file__), "envs", "ec",
                                    "output", "test_reward.txt")

    while runner.t_env <= args.t_max:  # t_env: total environment timesteps so far

        # Run for a whole episode at a time
        episode_batch = runner.run(
            test_mode=False)  # runner.run() returns one episode of data
        global_reward += get_episode_reward(
            episode_batch.data.transition_data)  # record the reward of every step
        global_state += get_episode_state(
            episode_batch.data.transition_data)  # record the state of every step

        # Save the state/reward data collected in test mode. Run tests every
        # args.reward_period steps; roughly args.reward_period states are tested.
        if runner.t_env % args.reward_period == 0:
            print(
                "---------------------------------running test episodes-----------------------------------------"
            )
            for i in range(int(args.reward_period / 20)):
                episode_data = runner.run(test_mode=True)  # run in test mode
                test_state += get_episode_state(
                    episode_data.data.transition_data)
                test_reward += get_episode_reward(
                    episode_data.data.transition_data)

        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    save_state_reward(state_path, global_state)
    save_state_reward(file_path, global_reward)
    save_state_reward(test_state_path, test_state)
    save_state_reward(test_reward_path, test_reward)
    logger.console_logger.info("Finished Training")
Exemplo n.º 8
0
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.episode_limit = env_info["episode_limit"]

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
        "battle_won": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device,
                          save_episodes=bool(args.save_episodes),
                          episode_dir=args.episode_dir,
                          clear_existing_episodes=args.clear_existing_episodes)  # TODO maybe just pass args

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    # Model learner
    model_learner = None
    model_buffer = None
    if args.model_learner:
        model_learner = le_REGISTRY[args.model_learner](mac, scheme, logger, args)
        model_buffer = ReplayBuffer(scheme, groups, args.model_buffer_size, buffer.max_seq_length,
                                    preprocess=preprocess,
                                    device="cpu" if args.buffer_cpu_only else args.device,
                                    save_episodes=False)

    if args.use_cuda:
        learner.cuda()
        if model_learner:
            model_learner.cuda()

    if args.checkpoint_path != "":
        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path))
            return

        timestep_to_load = 0
        if args.rl_checkpoint:
            rl_timesteps = []

            # Go through all files in args.checkpoint_path
            for name in os.listdir(args.checkpoint_path):
                full_name = os.path.join(args.checkpoint_path, name)
                # Check if they are dirs the names of which are numbers
                name = name.replace('rl_', '')
                if os.path.isdir(full_name) and name.isdigit():
                    rl_timesteps.append(int(name))

            load_step = int(args.load_step.replace('rl_', '')) if isinstance(args.load_step, str) else args.load_step
            if load_step == 0:
                # choose the max timestep
                timestep_to_load = max(rl_timesteps)
            else:
                # choose the timestep closest to load_step
                timestep_to_load = min(rl_timesteps, key=lambda x: abs(x - load_step))

            model_path = os.path.join(args.checkpoint_path, f"rl_{timestep_to_load}")

        else:
            timesteps = []

            # Go through all files in args.checkpoint_path
            for name in os.listdir(args.checkpoint_path):
                full_name = os.path.join(args.checkpoint_path, name)
                # Check if they are dirs the names of which are numbers
                if os.path.isdir(full_name) and name.isdigit():
                    timesteps.append(int(name))

            if args.load_step == 0:
                # choose the max timestep
                timestep_to_load = max(timesteps)
            else:
                # choose the timestep closest to load_step
                timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

            model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner, buffer)
            return

        # TODO checkpoints for model_learner

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    # new stuff
    collect_episodes = True
    collected_episodes = 0
    train_rl = False
    rl_iterations = 0
    model_trained = False
    n_model_trained = 0
    last_rl_T = 0
    rl_model_save_time = 0

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))
    while runner.t_env <= args.t_max:

        if model_learner:
            if collect_episodes:
                episode_batch = runner.run(test_mode=False)  # collect real episode to progress t_env
                print(f"Collecting {args.batch_size_run} episodes from REAL ENV using epsilon: {runner.mac.env_action_selector.epsilon:.2f}, t_env: {runner.t_env}, collected episodes: {collected_episodes}")
                buffer.insert_episode_batch(episode_batch)
                collected_episodes += args.batch_size_run

            n_collect = args.model_n_collect_episodes if model_trained else args.model_n_collect_episodes_initial
            if collected_episodes >= n_collect:
                print(f"Collected {collected_episodes} REAL episodes, training ENV model")
                # stop collection and train model
                collect_episodes = False
                collected_episodes = 0
                model_learner.train(buffer, runner.t_env, plot_test_results=False)
                model_trained = True
                n_model_trained += 1
                train_rl = True

                if args.model_rollout_before_rl:
                    print(f"Generating {args.model_rollouts} MODEL episodes")
                    rollouts = 0
                    rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size)
                    while rollouts < args.model_rollouts:
                        model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations)
                        model_buffer.insert_episode_batch(model_batch)
                        rollouts += rollout_batch_size

            if train_rl: # and model_buffer.can_sample(args.batch_size):

                # generate synthetic episodes under current policy
                if not args.model_rollout_before_rl:
                    print(f"Generating {args.model_rollouts} MODEL episodes")
                    rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size)
                    model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations)
                    model_buffer.insert_episode_batch(model_batch)

                if model_buffer.can_sample(args.batch_size):
                    for _ in range(args.model_rl_iterations_per_generated_sample):
                        episode_sample = model_buffer.sample(args.batch_size)

                        # truncate batch to only filled timesteps
                        max_ep_t = episode_sample.max_t_filled()
                        episode_sample = episode_sample[:, :max_ep_t]

                        if episode_sample.device != args.device:
                            episode_sample.to(args.device)

                        # train RL agent
                        learner.train(episode_sample, runner.t_env, rl_iterations)
                        rl_iterations += 1
                        print(f"Model RL iteration {rl_iterations}, t_env: {runner.t_env}")

            if not collect_episodes and rl_iterations > 0 and rl_iterations % args.model_update_interval == 0:
                if args.max_model_trained == 0 or (args.max_model_trained and n_model_trained < args.max_model_trained):
                    print("Time to update model")
                    collect_episodes = True
                    train_rl = False

            # update stats
            model_learner.log_stats(runner.t_env)
            if (runner.t_env - last_log_T) >= args.log_interval:
                logger.log_stat("model_rl_iterations", rl_iterations, runner.t_env)
            if rl_iterations > 0 and (rl_iterations - last_rl_T) / args.rl_test_interval >= 1.0:
                print("Logging rl stats")
                model_learner.log_rl_stats(rl_iterations)

        else:
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)
            if args.save_episodes and args.save_policy_outputs and args.runner == "episode":
                mac.save_policy_outputs()
            if buffer.can_sample(args.batch_size):
                for _ in range(args.batch_size_run):
                    episode_sample = buffer.sample(args.batch_size)

                    # Truncate batch to only filled timesteps
                    max_ep_t = episode_sample.max_t_filled()
                    episode_sample = episode_sample[:, :max_ep_t]

                    if episode_sample.device != args.device:
                        episode_sample.to(args.device)

                    learner.train(episode_sample, runner.t_env, episode)
                    rl_iterations += 1
                    print(f"RL iteration {rl_iterations}, t_env: {runner.t_env}")

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if ((runner.t_env - last_test_T) / args.test_interval >= 1.0) or (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0):

            print("Running test cases")

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            last_rl_T = rl_iterations
            runner.t_rl = rl_iterations

            for _ in range(n_test_runs):
                runner.run(test_mode=True)

            logger.print_recent_stats()

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        if args.save_model and model_trained and (rl_iterations == 0 or (rl_iterations - rl_model_save_time)/args.rl_save_model_interval >= 1.0):
            print(f"Saving at RL model iteration {rl_iterations}")
            rl_model_save_time = rl_iterations
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, f"rl_{rl_iterations}")
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("rl_iterations", rl_iterations, runner.t_env)
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Exemplo n.º 9
0
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Setup schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]

    # Default/base scheme
    reward_dict = {"vshape": (1,), "group": "agents", "dtype": th.float32} if args.env_args["reward_local"] else {"vshape": (1,)}
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": reward_dict,
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    # TODO: what is groups controlling?
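    # (In PyMARL-style schemes, groups likely tells the buffer which fields are
    # replicated per agent: entries declared with "group": "agents" get an extra
    # dimension of size groups["agents"].)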
    groups = {
        "agents": args.n_agents
    }
    # TODO: where/how is pre processing applied?
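    # (Preprocessing is presumably applied when episodes are written into the
    # batch/buffer: here the OneHot transform derives an "actions_onehot" field
    # from the stored "actions".)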
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    # TODO: why create replaybuffer with episode limit + 1?
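    # (episode_limit + 1 presumably leaves room for the entry recorded after the
    # final transition, e.g. the last state/observation used for bootstrapping.)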
    # Setup replaybuffer
    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multi-agent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Setup runner with created scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Setup learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    # Activate CUDA
    if args.use_cuda:
        learner.cuda()

    # Load checkpoint if necessary
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        # Check checkpoint path integrity -> exist or else no model can be loaded later
        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        # TODO: enforce learner loading correct model?
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    #
    # Start training
    #
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time -> runner returns a episode batch
        episode_batch = runner.run(test_mode=False)
        # Save episode in replay buffer
        buffer.insert_episode_batch(episode_batch)

        # If enough episodes saved -> sample
        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            # TODO: explain max_t_filled
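            # (max_t_filled presumably returns the length of the longest filled
            # episode in the sampled batch, so the padded tail can be dropped.)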
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            # TODO: when is device differing?!
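            # (The devices differ when the buffer is kept on the CPU via
            # buffer_cpu_only while training runs on args.device, e.g. "cuda".)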
            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            # Train on sampled episodes
            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save model after certain time
        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        # Increase total episode counter by batch size of episodes currently run
        # TODO: follow batch_size_run!
        episode += args.batch_size_run

        # Log stats in interval
        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
Exemplo n.º 10
0
def run_reptile(args, logger, _log, _run):

    loggers = {}
    runners = {}
    macs = {}
    learners = {}
    buffers = {}

    agent_state_dict = None

    import yaml
    #from .main import _get_config
    # compile all the relevant task configs
    task_configs = {}

    class Bunch(object):
        def __init__(self, adict):
            self.__dict__.update(adict)

    r = np.random.RandomState(args.seed)
    for k, v in sorted(args.tasks.items()): # important for reproducibility of seeds!

        # Get the defaults from default.yaml
        with open(os.path.join(os.path.dirname(__file__), "config", "default.yaml"), "r") as f:
            try:
                config_dict = yaml.load(f)
            except yaml.YAMLError as exc:
                assert False, "default.yaml error: {}".format(exc)

        # Load algorithm and env base configs
        params = ["", "--config={}".format(v.pop("config")), "--env-config={}".format(v.pop("env-config"))]
        alg_config = _get_config(params, "--config", "algs")
        env_config = _get_config(params, "--env-config", "envs")

        # config_dict = {**config_dict, **env_config, **alg_config}
        config_dict = recursive_dict_update(config_dict, env_config)
        config_dict = recursive_dict_update(config_dict, alg_config)
        config_dict = recursive_dict_update(config_dict, v)

        # from src.utils.dict2namedtuple import convert
        config_dict.pop("no-mongo")
        config_dict["seed"] = r.randint(0, 2**31-1) # have to set manually
        config_dict["env_args"]["seed"] = r.randint(0, 2**31-1)
        config_dict["device"] = args.device
        config_dict["unique_token"] = "{}__{}".format(args.unique_token,
                                                     k)
        task_configs[k] = Bunch(config_dict)

    def setup_components(logger,
                         agent_state_dict):
        task_names = []
        for task_name, _ in task_configs.items():
            task_names.append(task_name)

        # set up tasks based on the configs
        for task_name, task_config in task_configs.items():

            task_args = task_config

            from copy import deepcopy
            logger = Logger(_log)
            # sacred is on by default
            logger.setup_sacred(_run)
            # logger = deepcopy(meta_logger)
            logger.prefix = task_name
            loggers[task_name] = logger

            # Init runner so we can get env info
            runner = r_REGISTRY[task_args.runner](args=task_args,
                                                  logger=logger)
            runners[task_name] = runner

            # Set up schemes and groups here
            env_info = runner.get_env_info()
            task_args.n_agents = env_info["n_agents"]
            task_args.n_actions = env_info["n_actions"]
            task_args.obs_decoder = dill.loads(env_info["obs_decoder"]) if env_info["obs_decoder"] is not None else None
            task_args.avail_actions_encoder = dill.loads(env_info["avail_actions_encoder_grid"]) \
                if env_info["avail_actions_encoder_grid"] is not None else None
            task_args.db_url = args.db_url
            task_args.db_name = args.db_name
            task_args.state_shape = env_info["state_shape"]
            task_args.state_decoder = dill.loads(env_info["state_decoder"]) if env_info["state_decoder"] is not None else None

            # Default/Base scheme
            scheme = {
                "state": {"vshape": env_info["state_shape"]},
                "obs": {"vshape": env_info["obs_shape"], "group": "agents",
                        "vshape_decoded": env_info.get("obs_shape_decoded", env_info["obs_shape"])},
                "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
                "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
                "reward": {"vshape": (1,)},
                "terminated": {"vshape": (1,), "dtype": th.uint8},
            }
            groups = {
                "agents": task_args.n_agents
            }
            preprocess = {
                "actions": ("actions_onehot", [OneHot(out_dim=task_args.n_actions)])
            }

            buffer = ReplayBuffer(scheme, groups, task_args.buffer_size, env_info["episode_limit"] + 1,
                                  preprocess=preprocess,
                                  device="cpu" if task_args.buffer_cpu_only else args.device)
            buffers[task_name] = buffer

            # Setup multiagent controller here
            mac = mac_REGISTRY[task_args.mac](buffer.scheme, groups, task_args)

            #point model to same object
            macs[task_name] = mac
            mac.agent = macs[task_names[0]].agent

            # Give runner the scheme
            runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

            # Learner
            learner = le_REGISTRY[task_args.learner](mac, buffer.scheme, logger, task_args)
            learners[task_name] = learner

            if task_args.use_cuda:
                learner.cuda()

            #if agent_state_dict is None:
            #    agent_state_dict = mac.agent.state_dict()
            # else:
            #    # copy all weights that have same dimensions
            #    sd = mac.agent.state_dict()
            #    for k, v in agent_state_dict.items():
            #        if (k in sd) and (v.shape == sd[k].shape):
            #            setattr(mac.agent, k, v)


            if task_args.checkpoint_path != "":

                timesteps = []
                timestep_to_load = 0

                if not os.path.isdir(task_args.checkpoint_path):
                    logger.console_logger.info("Checkpoint directory {} doesn't exist".format(task_args.checkpoint_path))
                    return

                # Go through all files in args.checkpoint_path
                for name in os.listdir(task_args.checkpoint_path):
                    full_name = os.path.join(task_args.checkpoint_path, name)
                    # Check if they are dirs the names of which are numbers
                    if os.path.isdir(full_name) and name.isdigit():
                        timesteps.append(int(name))

                if task_args.load_step == 0:
                    # choose the max timestep
                    timestep_to_load = max(timesteps)
                else:
                    # choose the timestep closest to load_step
                    timestep_to_load = min(timesteps, key=lambda x: abs(x - task_args.load_step))

                model_path = os.path.join(task_args.checkpoint_path, str(timestep_to_load))

                logger.console_logger.info("Loading model from {}".format(model_path))
                learner.load_models(model_path)
                runner.t_env = timestep_to_load

                if task_args.evaluate or task_args.save_replay:
                    evaluate_sequential(task_args, runner)
                    return
        return


    from copy import deepcopy
    # agent_state_dict = setup_components(logger, agent_state_dict)
    setup_components(logger, agent_state_dict)

    # start reptile training
    episode_ctrs = {k:0 for k, _ in sorted(task_configs.items())}
    last_test_Ts = {k:-v.test_interval - 1 for k, v in sorted(task_configs.items())}
    last_times = {k:time.time() for k, v in sorted(task_configs.items())}
    model_save_times = {k:0 for k, _ in sorted(task_configs.items())}
    start_time = time.time()

    logger.console_logger.info("Beginning REPTILE training ...")

    previous_task_id = None
    unfinished_tasks = {k for k, v in task_configs.items() if episode_ctrs[k] <= v.t_max}
    while len(unfinished_tasks):
        # INNER LOOP
        unfinished_tasks = {k for k, v in task_configs.items() if episode_ctrs[k] <=v.t_max}

        # pick task
        from random import randint
        task_id = sorted(list(unfinished_tasks))[randint(0, len(unfinished_tasks)-1)]

        logger.console_logger.info("Chose task {} at global counter {}".format(task_id, sum(episode_ctrs.values())))

        # roll out task a couple of times
        for t in range(args.n_task_rollouts[task_id]):
            episode_batch = runners[task_id].run(test_mode=False)
            buffers[task_id].insert_episode_batch(episode_batch)
            # train on task
            episode_ctrs[task_id] += 1
            if episode_ctrs[task_id] >= task_configs[task_id].t_max:
                break

        # reset mac weights: copy all weights with matching shapes from the last
        # chosen task (possibly redundant, since the macs share the same agent object)
        if previous_task_id is not None:
            sd = macs[task_id].agent.state_dict()
            compatible = {k: v for k, v in macs[previous_task_id].agent.state_dict().items()
                          if (k in sd) and (v.shape == sd[k].shape)}
            macs[task_id].agent.load_state_dict(compatible, strict=False)

        # train
        for t in range(args.n_task_trains[task_id]):

            if buffers[task_id].can_sample(task_configs[task_id].batch_size):
                episode_sample = buffers[task_id].sample(task_configs[task_id].batch_size)
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]
                if episode_sample.device != task_configs[task_id].device:
                    episode_sample.to(task_configs[task_id].device)

                learners[task_id].train(episode_sample,
                                        runners[task_id].t_env,
                                        episode_ctrs[task_id])

        # update weights of same dimensions using simple rule (otherwise: formulate as a gradient procedure);
        # applied via load_state_dict because setattr with dotted parameter names cannot reach nested modules
        mac_state_dict = macs[task_id].agent.state_dict()
        for _task_id, _ in sorted(task_configs.items()):
            if _task_id != task_id:
                _mac_state_dict = macs[_task_id].agent.state_dict()
                updated = {
                    k: v + args.reptile_epsilon * (mac_state_dict[k] - v)
                    for k, v in _mac_state_dict.items()
                    if (k in mac_state_dict) and (v.shape == mac_state_dict[k].shape)
                }
                macs[_task_id].agent.load_state_dict(updated, strict=False)


        # Remember which task was just trained before the loops below reuse task_id
        previous_task_id = task_id

        for task_id, task_config in task_configs.items():
            # Execute test runs once in a while
            n_test_runs = max(1, task_configs[task_id].test_nepisode // runners[task_id].batch_size)
            if (runners[task_id].t_env - last_test_Ts[task_id]) / task_configs[task_id].test_interval >= 1.0:
                loggers[task_id].console_logger.info("Now testing: {}".format(task_id))
                loggers[task_id].console_logger.info("t_env: {} / {}".format(runners[task_id].t_env,
                                                                             task_configs[task_id].t_max))
                loggers[task_id].console_logger.info("Estimated time left: {}. Time passed: {}".format(
                    time_left(last_times[task_id],
                              last_test_Ts[task_id],
                              runners[task_id].t_env,
                              task_configs[task_id].t_max),
                    time_str(time.time() - start_time)))
                last_times[task_id] = time.time()

                last_test_Ts[task_id] = runners[task_id].t_env
                for _ in range(n_test_runs):
                    runners[task_id].run(test_mode=True)

        for task_id, task_config in task_configs.items():
            if task_config.save_model and \
                    (runners[task_id].t_env - model_save_times[task_id] >= task_config.save_model_interval or
                     model_save_times[task_id] == 0):
                model_save_times[task_id] = runners[task_id].t_env
                save_path = os.path.join(task_config.local_results_path,
                                         "models",
                                         task_config.unique_token,
                                         str(runners[task_id].t_env))
                #"results/models/{}".format(unique_token)
                os.makedirs(save_path, exist_ok=True)
                logger.console_logger.info("Saving models to {}".format(save_path))

                # learner should handle saving/loading -- delegate actor save/load to mac,
                # use appropriate filenames to do critics, optimizer states
                learners[task_id].save_models(save_path)
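Note: _get_config and recursive_dict_update come from the surrounding codebase and are not shown in this listing. A minimal sketch of the recursive merge used above, assuming nested mappings should be merged key by key rather than replaced wholesale:

import collections.abc

def recursive_dict_update(d, u):
    # Assumed behaviour: merge u into d in place, recursing into nested mappings.
    for k, v in u.items():
        if isinstance(v, collections.abc.Mapping):
            d[k] = recursive_dict_update(d.get(k, {}), v)
        else:
            d[k] = v
    return d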
Exemplo n.º 11
0
def run_sequential(args, logger):
    """
    真正运行函数
    :param args:
    :type args:
    :param logger:
    :type logger:
    :return:
    :rtype:
    """
    # init runner所以我们可以得到env info, 运行哪个runner,是src/runners/parallel_runner.py中的ParallelRunner  还是episode_runner.py
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    # number of agents, e.g. 8
    args.n_agents = env_info["n_agents"]
    # number of actions, e.g. 6
    args.n_actions = env_info["n_actions"]
    # dimension of the state, e.g. 300
    args.state_shape = env_info["state_shape"]

    if getattr(args, 'agent_own_state_size', False):
        args.agent_own_state_size = get_agent_own_state_size(args.env_args)

    # Custom scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "probs": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.float},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }
    # Replay buffer
    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)
    # Set up the multi-agent controller here, e.g. NMAC from src/controllers/n_controller.py
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give the runner this scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner, e.g. NQLearner from src/learners/nq_learner.py; which learner is used depends on the algorithm
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":
        # Load the checkpoint and continue training
        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs whose names are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("开始训练,训练的 {} 个时间步".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time

        with th.no_grad():
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)
            del episode_sample

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("完成训练")
Exemplo n.º 12
0
def run_sequential(args, logger):

    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)
    th.autograd.set_detect_anomaly(True)
    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]

    #    args.own_feature_size = env_info["own_feature_size"] #unit_type_bits+shield_bits_ally
    #if args.obs_last_action:
    #    args.own_feature_size+=args.n_actions
    #if args.obs_agent_id:
    #    args.own_feature_size+=args.n_agents

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
    }
    if args.learner == "hierarchical_rode_learner":
        scheme.update({
            "role_avail_actions": {
                "vshape": (env_info["n_actions"], ),
                "group": "agents",
                "dtype": th.int
            },
            "roles": {
                "vshape": (1, ),
                "group": "agents",
                "dtype": th.long
            }
        })
    if args.learner == "hierarchical_noise_q_learner":
        scheme.update({"noise": {"vshape": (args.noise_dim, )}})
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    if args.q_net_ensemble:
        mac = [
            mac_REGISTRY[args.mac](buffer.scheme, groups, args)
            for _ in range(args.ensemble_num)
        ]
    else:
        mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()
        if args.runner == "meta_noise":
            runner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                "Checkpoint directiory {} doesn't exist".format(
                    args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    if args.meta_h:
        last_meta_T = -args.meta_h_interval - 1
        meta_buffer = ReplayBuffer(
            scheme,
            groups,
            args.batch_size,
            env_info["episode_limit"] + 1,
            preprocess=preprocess,
            device="cpu" if args.buffer_cpu_only else args.device)
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(
        args.t_max))
    use_rode = args.learner == "hierarchical_rode_learner"
    meta_start_t = 0
    if args.learner == "hierarchical_rode_learner":
        meta_start_t = args.role_action_spaces_update_start
    if args.save_batch_interval > 0:
        last_save_batch = -args.save_batch_interval - 1
    whole_q_list = []
    if args.save_q_all:
        q_list_ind = 0
    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        # if args.meta_h:
        #     episode_batch, batch_log_p, mean_step_returns = runner.run(test_mode=False, meta_mode=True)
        # else:
        #     episode_batch, _ = runner.run(test_mode=False) #[8,181,10,1] for actions
        episode_batch, _ = runner.run(
            test_mode=False, use_rode=use_rode)  #[8,181,10,1] for actions
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size) and args.meta_h and \
            (runner.t_env - last_meta_T) / args.meta_h_interval >= 1.0 and runner.t_env >= meta_start_t:
            repeat_times = args.batch_size // runner.batch_size
            # meta_buffer.insert_episode_batch(episode_batch)
            batch_log_p_all = []
            mean_step_returns_all = []
            for _ in range(repeat_times):
                #[8]
                # episode_batch, batch_log_p, mean_step_returns = runner.run_meta(test_mode=False, meta_mode=True)
                # batch_log_p_all.append(batch_log_p)
                episode_batch, _, mean_step_returns = runner.run_meta(
                    test_mode=False, meta_mode=True, use_rode=use_rode)
                mean_step_returns_all += mean_step_returns
                buffer.insert_episode_batch(episode_batch[0])
                meta_buffer.insert_episode_batch(episode_batch)
            #[32]
            # batch_log_p_all = th.cat(batch_log_p_all, dim=0)
            for _ in range(repeat_times):
                episode = prep_ep_and_train(meta_buffer, args, learner,
                                            episode, runner.t_env,
                                            whole_q_list)
            mean_step_returns_new_all = []
            for _ in range(repeat_times):
                episode_batch_new, mean_step_returns_new = runner.run_meta(
                    test_mode=False, use_rode=use_rode)
                buffer.insert_episode_batch(episode_batch_new[0])
                mean_step_returns_new_all += mean_step_returns_new
            #need to get batch_log_p_here
            batch_log_p_all = runner.get_log_p(meta_buffer)
            learner.train_meta(batch_log_p_all, mean_step_returns_all,
                               mean_step_returns_new_all, runner.t_env)
            for _ in range(repeat_times):
                episode = prep_ep_and_train(buffer, args, learner, episode,
                                            runner.t_env, whole_q_list)
            last_meta_T = runner.t_env
        elif buffer.can_sample(args.batch_size):
            prep_ep_and_train(buffer, args, learner, episode, runner.t_env,
                              whole_q_list)
            # episode_sample = buffer.sample(args.batch_size) #[32,181,10,1] for actions

            # # Truncate batch to only filled timesteps
            # max_ep_t = episode_sample.max_t_filled()
            # episode_sample = episode_sample[:, :max_ep_t]

            # if episode_sample.device != args.device:
            #     episode_sample.to(args.device)

            # learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(
                runner.t_env, args.t_max))
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            save_batch_flag = False
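            # Save batches 10x more frequently during the first fifth of training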
            discount = 1.0 if args.t_max // 5 <= runner.t_env else 10.0
            if args.save_batch_interval > 0 and (
                    runner.t_env - last_save_batch) / (
                        args.save_batch_interval // discount) >= 1.0:
                save_batch_flag = True
                last_save_batch = runner.t_env
            for i in range(n_test_runs):
                if args.runner in ("meta", "meta_noise"):
                    runner.run_meta(test_mode=True, use_rode=use_rode)
                else:
                    runner.run(test_mode=True, use_rode=use_rode)
                if save_batch_flag:
                    save_batch(runner.batch, osp.join(args.tb_logs, "batch"),
                               runner.t_env, i)
            if args.noise_bandit:
                for _ in range(n_test_runs):
                    runner.run_meta(test_mode=True, test_uniform=True)

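        # Save the models every save_model_interval timesteps (and once at the start)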
        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # The learner handles saving/loading: delegate actor save/load to the mac
            # and use appropriate filenames for critics and optimizer states.
            learner.save_models(save_path)

        # Meta runners advance one episode per iteration; others advance a full batch
        if args.runner in ("meta", "meta_noise"):
            episode += 1
        else:
            episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env
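        # Periodically flush the accumulated Q values to disk and start a new chunk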
        if args.save_q_all and len(whole_q_list) >= 4000:
            save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind)
            whole_q_list.clear()
            q_list_ind += 1

    if args.save_q_all and len(whole_q_list) > 0:
        save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind)
    runner.close_env()
    logger.console_logger.info("Finished Training")
Exemplo n.º 13
0
def run_sequential(args, logger):

    # Init runner (episode runner or parallel runner) so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]  # from SMAC maps
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    # args.unit_type_bits = env_info["unit_type_bits"]
    # args.shield_bits_ally = env_info["shield_bits_ally"]
    # args.shield_bits_enemy = env_info["shield_bits_enemy"]
    # args.n_enemies = env_info["n_enemies"]

    # Default/Base scheme
    scheme = {
        "state": {
            "vshape": env_info["state_shape"]
        },
        "obs": {
            "vshape": env_info["obs_shape"],
            "group": "agents"
        },
        "actions": {
            "vshape": (1, ),
            "group": "agents",
            "dtype": th.long
        },
        "avail_actions": {
            "vshape": (env_info["n_actions"], ),
            "group": "agents",
            "dtype": th.int
        },
        "reward": {
            "vshape": (1, )
        },
        "terminated": {
            "vshape": (1, ),
            "dtype": th.uint8
        },
        #"policy": {"vshape": (env_info["n_agents"],)}
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

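    # Two replay buffers: `buffer` stores episodes for both critic and actor
    # updates, while `off_buffer` supplies the best_batch used by train_critic.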
    buffer = ReplayBuffer(
        scheme,
        groups,
        args.buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)
    off_buffer = ReplayBuffer(
        scheme,
        groups,
        args.off_buffer_size,
        env_info["episode_limit"] + 1,
        preprocess=preprocess,
        device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    runner.set_learner(learner)

    ###### If checkpoint_path is given, load the model; if args.evaluate or
    ###### args.save_replay is also set, evaluate and return without training.
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info(
                f"Checkpoint directiory {args.checkpoint_path} doesn't exist")
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Keep only directories whose names are numbers (saved timesteps)
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps,
                                   key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info(f"Loading model from {model_path}")
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return
    ########################################################################################################

    ######## start training
    episode = 0
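    # Negative offset so the first test run triggers immediately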
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info(
        f"Beginning training for {args.t_max} timesteps")

    while runner.t_env <= args.t_max:

        # critic running log
        running_log = {
            "critic_loss": [],
            "critic_grad_norm": [],
            "td_error_abs": [],
            "target_mean": [],
            "q_taken_mean": [],
            "q_max_mean": [],
            "q_min_mean": [],
            "q_max_var": [],
            "q_min_var": []
        }

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)
        off_buffer.insert_episode_batch(episode_batch)

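        # Update only once both buffers can supply a full batch of episodes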
        if buffer.can_sample(args.batch_size) and off_buffer.can_sample(
                args.off_batch_size):
            # Train the critic, passing the off-buffer sample as best_batch
            uni_episode_sample = buffer.uni_sample(args.batch_size)
            off_episode_sample = off_buffer.uni_sample(args.off_batch_size)
            max_ep_t = max(uni_episode_sample.max_t_filled(),
                           off_episode_sample.max_t_filled())
            uni_episode_sample = process_batch(
                uni_episode_sample[:, :max_ep_t], args)
            off_episode_sample = process_batch(
                off_episode_sample[:, :max_ep_t], args)
            learner.train_critic(uni_episode_sample,
                                 best_batch=off_episode_sample,
                                 log=running_log)

            # Train the actor on the latest on-policy episodes
            episode_sample = buffer.sample_latest(args.batch_size)
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = process_batch(episode_sample[:, :max_ep_t], args)
            learner.train(episode_sample, runner.t_env, running_log)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info(f"t_env: {runner.t_env} / {args.t_max}")
            logger.console_logger.info(
                "Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, last_test_T, runner.t_env,
                              args.t_max), time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save the models every save_model_interval timesteps (and once at the start)
        if args.save_model and (
                runner.t_env - model_save_time >= args.save_model_interval
                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models",
                                     args.unique_token, str(runner.t_env))
            #"results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info(f"Saving models to {save_path}")

            # The learner handles saving/loading: delegate actor save/load to the mac
            # and use appropriate filenames for critics and optimizer states.
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
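
# NOTE: evaluate_sequential is called above but not shown in this example. A
# minimal sketch of what it is assumed to do (run test episodes, optionally
# save a replay, then close the environment):
def evaluate_sequential(args, runner):
    # Roll out the requested number of test episodes with exploration disabled
    for _ in range(args.test_nepisode):
        runner.run(test_mode=True)
    # Optionally dump a replay of the evaluation episodes
    if args.save_replay:
        runner.save_replay()
    runner.close_env()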