Exemplo n.º 1
0
    def _save_training_info(self):
        """
        Snapshot the current training state and persist it.

        Rebuilds ``self.status`` from the frame counter, the update counter
        and the state dicts of the model and the optimizer, adds the
        preprocessor vocabulary when the preprocessor exposes one, then hands
        the dictionary to ``utils.save_status`` together with
        ``self.model_dir`` and logs a confirmation message.
        """

        # Gather the pieces in the same order the status dictionary lists them.
        snapshot = dict(
            num_frames=self.num_frames,
            update=self.update,
            model_state=self.acmodel.state_dict(),
            optimizer_state=self.algo.optimizer.state_dict(),
        )
        self.status = snapshot

        # Only preprocessors that expose a ``vocab`` attribute contribute a
        # "vocab" entry to the saved status.
        has_vocab = hasattr(self.preprocess_obss, "vocab")
        if has_vocab:
            snapshot["vocab"] = self.preprocess_obss.vocab.vocab

        # Persist the status under the model directory and report it.
        target_dir = self.model_dir
        utils.save_status(snapshot, target_dir)
        self.txt_logger.info("Status saved")
Exemplo n.º 2
0
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "success_rate", "policy_loss", "value_loss", "grad_norm"]
            data += [logs["entropy"], logs["value"], success_per_episode["mean"], logs["policy_loss"], logs["value_loss"], logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | G {} | rR:uomM {:.2f} {:.2f} {:.2f} {:.2f} | F:uomM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | S {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data).encode('utf-8'))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(),
                    "agent_goals": algo.goals, "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obs_goals, "vocab"):
                status["vocab"] = preprocess_obs_goals.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
Exemplo n.º 3
0
        ]

        logger.info(
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:x̄σmM {:.2f} {:.2f} {:.2f} {:.2f} | F:x̄σmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
            .format(*data))

        header += ["return_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()

        if not (status["num_frames"]):
            csv_writer.writerow(header)
        csv_writer.writerow(data)

        if args.tb:
            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        status = {"num_frames": num_frames, "update": update}
        utils.save_status(status, save_dir)

    # Save vocabulary and model

    if args.save_interval > 0 and update % args.save_interval == 0:
        preprocess_obss.vocab.save()

        if torch.cuda.is_available():
            acmodel.cpu()
        utils.save_model(acmodel, save_dir)
        logger.info("Model successfully saved")
        if torch.cuda.is_available():
            acmodel.cuda()
Exemplo n.º 4
0
        txt_logger.info(
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | ARPS: {:.3f} | ADR: {:.3f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
            .format(*data))

        header += ["return_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()

        if status["num_frames"] == 0:
            csv_logger.writerow(header)
        csv_logger.writerow(data)
        csv_file.flush()

        for field, value in zip(header, data):
            tb_writer.add_scalar(field, value, num_frames)

    # Save status

    if args.save_interval > 0 and update % args.save_interval == 0:
        status = {"num_frames": num_frames, "update": update,
                  "model_state": algo.acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
        if hasattr(preprocess_obss, "vocab") and preprocess_obss.vocab is not None:
            status["vocab"] = preprocess_obss.vocab.vocab
        utils.save_status(status, model_dir + "/train")
        txt_logger.info("Status saved")

        if args.eval:
            # we send the num_frames to align the eval curves with the training curves on TB
            for evalu in evals:
                evalu.eval(num_frames, episodes=args.eval_episodes)
Exemplo n.º 5
0
def main():
    """Parse the command line, build the agent and run the training loop.

    The function:

    1. parses the command-line arguments;
    2. resolves the model directory and opens the text, CSV and Tensorboard
       loggers in it;
    3. seeds the randomness sources and creates ``--procs`` environments;
    4. restores a previous training status from the model directory when
       ``utils.get_status`` finds one (vocabulary, model and optimizer
       state), otherwise starts from zero frames / zero updates;
    5. alternates experience collection and parameter updates until
       ``--frames`` frames have been processed, logging every
       ``--log-interval`` updates and saving every ``--save-interval``
       updates.

    When ``--visualize`` is set, the absolute change of selected model
    parameters between two consecutive updates is drawn with matplotlib.

    Raises:
        ValueError: if ``--algo`` is not one of the supported names.
    """
    # Parse arguments

    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument(
        "--algo",
        required=True,
        help="algorithm to use: a2c | ppo | ppo_intrinsic (REQUIRED)")
    parser.add_argument("--env",
                        required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument(
        "--model",
        default=None,
        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")
    # NOTE(review): this option takes a value (there is no
    # action="store_true"), so any non-empty string -- including "False" --
    # enables the visualization. Left unchanged to keep existing command
    # lines such as "--visualize True" working.
    parser.add_argument("--visualize",
                        default=False,
                        help="show real time CNN layer weight changes")

    args = parser.parse_args()

    # The model gets a memory only when gradients flow over several steps.
    args.mem = args.recurrence > 1

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments (each one gets its own seed, 10000 apart)

    envs = [
        utils.make_env(args.env, args.seed + 10000 * i)
        for i in range(args.procs)
    ]
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        # No readable status in the model directory: start a fresh run.
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)

    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_alpha,
            args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # The CSV header is written once, and only for a run that starts from
    # zero frames. `status` is not reassigned until the first save, so testing
    # status["num_frames"] inside the loop would repeat the header at every
    # log until then.
    csv_header_pending = status["num_frames"] == 0

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:

        # Store copies of s_t model params. Only needed for the visualization.
        # `.cpu()` is required because `Tensor.numpy()` refuses CUDA tensors
        # and the model lives on `device`.
        if print_visual:
            old_parameters = {
                name: param.detach().cpu().numpy().copy()
                for name, param in acmodel.named_parameters()
            }

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        if print_visual:
            # Store copies of s_t+1 model params
            new_parameters = {
                name: param.detach().cpu().numpy().copy()
                for name, param in acmodel.named_parameters()
            }

            # Draw the normalized absolute change of the parameters at
            # positions 0, 2 and 4, one per subplot.
            # NOTE(review): presumably these are the weights of the three
            # convolution layers (the [:, :, 0, 0] indexing requires 4-D
            # arrays) -- confirm against ACModel.
            for index, key in enumerate(old_parameters):
                if index not in (0, 2, 4):
                    continue
                diff_matrix = abs(new_parameters[key] - old_parameters[key])
                diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0],
                                                    norm='max',
                                                    axis=0)
                axs[index // 2].imshow(diff_matrix[:, :, 0, 0],
                                       cmap='Greens',
                                       interpolation='nearest')

            # This allows the plots to update as the model trains
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            # The text line is emitted before the plain returns are appended:
            # the format string consumes exactly the values gathered so far.
            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if csv_header_pending:
                csv_logger.writerow(header)
                csv_header_pending = False
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict()
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
Exemplo n.º 6
0
                    logs["value"], logs["policy_loss"], logs["value_loss"],
                    logs["grad_norm"]))
        if args.tb:
            writer.add_scalar("frames", num_frames, num_frames)
            writer.add_scalar("FPS", fps, num_frames)
            writer.add_scalar("duration", total_ellapsed_time, num_frames)
            for key, value in return_per_episode.items():
                writer.add_scalar("return_" + key, value, num_frames)
            for key, value in rreturn_per_episode.items():
                writer.add_scalar("rreturn_" + key, value, num_frames)
            for key, value in num_frames_per_episode.items():
                writer.add_scalar("num_frames_" + key, value, num_frames)
            writer.add_scalar("entropy", logs["entropy"], num_frames)
            writer.add_scalar("value", logs["value"], num_frames)
            writer.add_scalar("policy_loss", logs["policy_loss"], num_frames)
            writer.add_scalar("value_loss", logs["value_loss"], num_frames)
            writer.add_scalar("grad_norm", logs["grad_norm"], num_frames)

    # Save obss preprocessor, vocabulary and model

    if args.save_interval > 0 and i % args.save_interval == 0:
        obss_preprocessor.vocab.save()

        if torch.cuda.is_available():
            acmodel.cpu()
        utils.save_model(acmodel, run_dir)
        status = {"num_frames": num_frames, "i": i}
        utils.save_status(status, run_dir)
        logger.info("Model successfully saved")
        if torch.cuda.is_available():
            acmodel.cuda()
def main(raw_args=None):
    """Train an agent on a mixture of environments from two domains.

    ``--procs`` environments are created in total: ``ceil(p1 * procs)`` from
    ``--domain1`` and the remainder from ``--domain2``. With ``--algo a2c``
    or ``--algo ppo`` a single algorithm instance trains on all of them; with
    ``--algo ipo`` one ``IPOAlgo`` instance is created per domain, both
    sharing the same model, and they are updated one after the other at every
    iteration.

    A previous training status found in the model directory (vocabulary,
    model state, optimizer state(s)) is restored before training. Logs are
    written every ``--log-interval`` updates and the status is saved every
    ``--save-interval`` updates.

    Args:
        raw_args: optional list of command-line tokens handed to
            ``ArgumentParser.parse_args``; ``None`` makes argparse read
            ``sys.argv``.

    Raises:
        ValueError: if the installed PyTorch version is not 1.2.0, if
            ``--recurrence`` is greater than 1 (memory is not supported), if
            ``--p1`` is not within [0, 1], or if ``--algo`` is not one of
            ``a2c``, ``ppo``, ``ipo``.
    """

    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo",
                        required=True,
                        help="algorithm to use: a2c | ppo | ipo (REQUIRED)")
    parser.add_argument("--domain1",
                        required=True,
                        help="name of the first domain to train on (REQUIRED)")
    parser.add_argument(
        "--domain2",
        required=True,
        help="name of the second domain to train on (REQUIRED)")
    parser.add_argument(
        "--p1",
        required=True,
        type=float,
        help="Proportion of training environments from first domain (REQUIRED)"
    )
    parser.add_argument("--model", required=True, help="name of the model")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")

    args = parser.parse_args(raw_args)

    args.mem = args.recurrence > 1

    # Check PyTorch version
    if (torch.__version__ != '1.2.0'):
        raise ValueError(
            "PyTorch version must be 1.2.0 (see README). Your version is {}.".
            format(torch.__version__))

    if args.mem:
        raise ValueError("Policies with memory not supported.")

    # A proportion outside [0, 1] would give a negative or oversized
    # environment count and negative weights when averaging the IPO logs.
    if not 0.0 <= args.p1 <= 1.0:
        raise ValueError("--p1 must be in [0, 1], got {}".format(args.p1))

    # Set run dir (--model is required, so no generated default is needed)

    model_name = args.model
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    torch.backends.cudnn.deterministic = True
    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments from different domains
    domain1 = args.domain1  # e.g., 'MiniGrid-ColoredKeysRed-v0'
    domain2 = args.domain2  # e.g., 'MiniGrid-ColoredKeysYellow-v0'

    p1 = args.p1  # Proportion of environments from domain1

    num_envs_total = args.procs  # Total number of environments
    num_domain1 = math.ceil(
        p1 * num_envs_total)  # Number of environments in domain1
    num_domain2 = num_envs_total - num_domain1  # Number of environments in domain2

    # NOTE(review): both lists start their seeds at args.seed (i = 0), so the
    # k-th environment of domain2 reuses the seed of the k-th environment of
    # domain1. Kept as-is to preserve reproducibility of existing runs;
    # confirm this is intended.

    # Environments from domain1
    envs1 = [
        utils.make_env(domain1, args.seed + 10000 * i)
        for i in range(num_domain1)
    ]

    # Environments from domain2
    envs2 = [
        utils.make_env(domain2, args.seed + 10000 * i)
        for i in range(num_domain2)
    ]

    # All environments
    envs = envs1 + envs2

    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        # No readable status in the model directory: start a fresh run.
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model: the IPO game uses ACModel_average, everything else (standard
    # PPO or A2C) uses ACModel. The restore/log steps are identical.

    model_cls = ACModel_average if args.algo == "ipo" else ACModel
    acmodel = model_cls(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")

    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)

        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")

    elif args.algo == "ipo":
        # One algo per domain. These have different environments, but a
        # shared acmodel.
        algo1 = torch_ac.IPOAlgo(
            envs1, acmodel, 1, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)

        algo2 = torch_ac.IPOAlgo(
            envs2, acmodel, 2, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)

        if "optimizer_state1" in status:
            algo1.optimizer.load_state_dict(status["optimizer_state1"])
            txt_logger.info("Optimizer 1 loaded\n")
        if "optimizer_state2" in status:
            algo2.optimizer.load_state_dict(status["optimizer_state2"])
            txt_logger.info("Optimizer 2 loaded\n")

    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # The CSV header is written once, and only for a run that starts from
    # zero frames. `status` is not reassigned until the first save, so testing
    # status["num_frames"] inside the loop would repeat the header at every
    # log until then.
    csv_header_pending = status["num_frames"] == 0

    while num_frames < args.frames:
        # Update model parameters

        update_start_time = time.time()

        if args.algo == "ipo":

            # Standard method

            # Collect experiences on first domain
            exps1, logs_exps1 = algo1.collect_experiences()

            # Update params of model corresponding to first domain
            logs_algo1 = algo1.update_parameters(exps1)

            # Collect experiences on second domain
            exps2, logs_exps2 = algo2.collect_experiences()

            # Update params of model corresponding to second domain
            logs_algo2 = algo2.update_parameters(exps2)

            # Update end time
            update_end_time = time.time()

            # Combine logs: experience logs of both domains are added
            # together with `+`.
            logs_exps = {
                key: logs_exps1[key] + logs_exps2[key]
                for key in ("return_per_episode",
                            "reshaped_return_per_episode",
                            "num_frames_per_episode", "num_frames")
            }

            # Optimization statistics are averaged, weighted by the number of
            # environments of each domain.
            logs_algo = {
                key: (num_domain1 * logs_algo1[key] +
                      num_domain2 * logs_algo2[key]) / num_envs_total
                for key in ("entropy", "value", "policy_loss", "value_loss",
                            "grad_norm")
            }

            logs = {**logs_exps, **logs_algo}
            num_frames += logs["num_frames"]

        else:
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()
            num_frames += logs["num_frames"]

        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            # The text line is emitted before the remaining columns are
            # appended: the format string consumes exactly the values
            # gathered so far.
            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # Total loss: policy loss minus the entropy term plus the
            # weighted value loss.
            header += ["total_loss"]
            data += [
                logs["policy_loss"] - args.entropy_coef * logs["entropy"] +
                args.value_loss_coef * logs["value_loss"]
            ]

            if csv_header_pending:
                csv_logger.writerow(header)
                csv_header_pending = False

            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:

            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
            }
            if args.algo == "ipo":
                # One optimizer per domain-specific algorithm.
                status["optimizer_state1"] = algo1.optimizer.state_dict()
                status["optimizer_state2"] = algo2.optimizer.state_dict()
            else:
                status["optimizer_state"] = algo.optimizer.state_dict()

            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
Exemplo n.º 8
0
def run(full_args: Namespace, return_models: bool = False):
    """Build environments, model and agent from ``full_args`` and train.

    The function reads its configuration from the sub-namespaces
    ``full_args.main``, ``full_args.agent``, ``full_args.model`` and
    ``full_args.env_cfg`` (plus the optional ``extra_logs`` and
    ``main_r_key`` attributes), resumes from any status / training data
    found in the model directory, and then loops over
    ``algo.update_parameters()`` until ``args.frames`` frames were seen.

    Args:
        full_args: Experiment configuration namespace.
        return_models: When true, skip training and return the freshly
            built objects instead.

    Returns:
        ``(algo, model, envs, saver)`` when ``return_models`` is true,
        otherwise ``None`` once the frame budget is exhausted.

    Raises:
        SystemExit: When the mean of the last ``max_eprews_window`` logged
            returns exceeds ``args.max_eprews``.
    """
    if sys.argv[0].startswith("train"):
        import os
        full_args.out_dir = os.path.dirname(sys.argv[1])

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    extra_logs = getattr(full_args, "extra_logs", None)
    main_r_key = getattr(full_args, "main_r_key", None)

    # A seed of 0 means "derive the seed from the run id".
    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews
    max_eprews_window = getattr(args, "max_eprews_window", 1)

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==============================================================================================
    # @ torc_rl repo original

    # Define logger, CSV writer and Tensorboard writer

    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments

    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==============================================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==============================================================================================
    # Generate environments

    envs = []

    # Get env wrappers - must be a list of elements
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:

        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(gym_wrappers, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            # Wrappers are applied in reverse order, so the first configured
            # wrapper ends up outermost.
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    no_actions = getattr(full_args.env_cfg, "no_actions", 6)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args,
                                    env_wrapper,
                                    no_envs,
                                    n_actions=no_actions)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps

            # Give every environment its own, well separated seed.
            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    eval_episodes = getattr(full_args.env_cfg, "eval_episodes", 0)
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args,
                                         env_wrapper,
                                         no_envs,
                                         n_actions=no_actions)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    permute = getattr(full_args.env_cfg, "permute", False)
    obss_preprocessor = getattr(full_args.env_cfg, "obss_preprocessor", None)
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env,
        first_env.observation_space,
        model_dir,
        max_image_value=max_image_value,
        normalize=normalize_img,
        permute=permute,
        type=obss_preprocessor)

    # Inspect one observation to publish optional shapes to the config.
    first_obs = first_env.reset()
    if "state" in first_obs:
        full_state_size = first_obs["state"].shape

        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "full_state_size",
                   full_state_size)

    if "position" in first_obs:
        position_size = first_obs["position"].shape

        # Add position shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "position_size", position_size)

    # Add the width and height of environment for position estimation
    model_args.width = first_env.unwrapped.width
    model_args.height = first_env.unwrapped.height

    # ==============================================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir,
                           save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==============================================================================================
    # Load Model

    if model is None:
        model = get_model(model_args,
                          obs_space,
                          first_env.action_space,
                          use_memory=model_args.mem)
        logger.info(f"Model [{model_args.name}] successfully created\n")

        # Print Model info
        logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==============================================================================================
    # Load Agent

    algo = get_agent(full_args.agent,
                     envs,
                     model,
                     agent_data,
                     preprocess_obss=preprocess_obss,
                     reshape_reward=None,
                     eval_envs=eval_envs,
                     eval_episodes=eval_episodes)

    has_evaluator = hasattr(algo,
                            "evaluate") and full_args.env_cfg.no_eval_envs > 0

    if return_models:
        return algo, model, envs, saver

    # ==============================================================================================
    # Train model

    prev_rewards = []
    crt_eprew = 0
    # other_data stays None when no previous training data could be loaded,
    # so it must be checked before the membership test.
    if other_data is not None and "eprew" in other_data:
        crt_eprew = other_data["eprew"]
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if update % args.eval_interval == 0 and has_evaluator:
            eval_logs = algo.evaluate(eval_key=main_r_key)
            logs.update(eval_logs)

        # FPS is measured between the ends of two consecutive updates.
        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # The CSV header is only written when training started from
            # scratch; `status` is replaced below, so this fires once.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            if main_r_key is None:
                # First synthesized statistic of the reshaped return.
                crt_eprew = list(rreturn_per_episode.values())[0]
                prev_rewards.append(crt_eprew)
            else:
                crt_eprew = logs[main_r_key]
                prev_rewards.append(logs[main_r_key])

        # -- Save vocabulary and model

        if args.save_interval > 0 and update % args.save_interval == 0:
            # NOTE(review): assumes the preprocessor exposes `vocab` - confirm
            # for every configured `obss_preprocessor` type.
            preprocess_obss.vocab.save()

            saver.save_training_data(model, algo.get_save_data(), crt_eprew)

            logger.info("Model successfully saved")

            utils.save_status(status, model_dir)

        # Early stop once the windowed mean return passes the threshold.
        # The mean is only computed when enough values were logged, which
        # avoids averaging an empty list (nan + RuntimeWarning).
        if len(prev_rewards) > max_eprews_window:
            check_rew = np.mean(prev_rewards[-max_eprews_window:])
            if check_rew > max_eprews:
                print(
                    f"Reached mean return {max_eprews} for a window of {max_eprews_window} steps"
                )
                sys.exit()
Exemplo n.º 9
0
def train_i2a_model(environment_class,
                    environment_model_name,
                    algorithm,
                    imagination_steps,
                    seed=1,
                    procs=16,
                    frames=10 ** 7,
                    log_interval=1,
                    save_interval=10,
                    frames_per_proc=None,
                    discount=0.99,
                    lr=7e-4,
                    gae_lambda=0.95,
                    entropy_coef=0.01,
                    value_loss_coef=0.5,
                    max_grad_norm=0.5,
                    recurrence=1,
                    optim_eps=1e-5,
                    optim_alpha=0.99,
                    clip_eps=0.2,
                    epochs=4,
                    batch_size=256,
                    no_instr=False,
                    no_mem=False,
                    note=None,
                    tensorboard=True):
    """Train an I2A agent on top of a previously saved environment model.

    Only the arguments listed below are read by this function directly; the
    remaining keyword arguments (``procs``, ``frames_per_proc``, ``discount``,
    ``lr``, ``gae_lambda``, ``entropy_coef``, ``value_loss_coef``,
    ``max_grad_norm``, ``recurrence``, ``optim_eps``, ``optim_alpha``,
    ``clip_eps``, ``epochs``, ``batch_size``, ``no_instr``, ``no_mem``) are
    merely captured and written to the log.

    Args:
        environment_class: Environment class; used for the run name and
            passed to ``I2AModel``.
        environment_model_name: Name of the saved environment model to load.
        algorithm: Object providing ``load_acmodel``, ``update_parameters``
            and an ``acmodel`` attribute.
        imagination_steps: Passed to ``I2AModel`` and used in the run name.
        seed: Seed handed to ``utils.seed``.
        frames: Training stops once this many frames were processed.
        log_interval: Number of updates between two log lines.
        save_interval: Number of updates between two model saves; values
            that are not positive disable saving.
        note: Optional text inserted into the run name.
        tensorboard: Whether scalars are also written with tensorboardX.
    """
    # Must stay the first statement so only the call arguments are captured.
    saved_arguments = locals()

    date_suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    if note:
        note = note + "_"
    else:
        note = ""

    env_label = environment_name(environment_class)
    model_name = "I2A-{}_{}{}_s{}_{}".format(imagination_steps, note, env_label, seed, date_suffix)
    model_dir = utils.get_model_dir(model_name)

    # Text logger, CSV writer and (optionally) Tensorboard writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)

    if tensorboard:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Record every argument this run was started with
    logger.info("{}\n".format(saved_arguments))

    # Seed all randomness sources
    utils.seed(seed)

    # Resume counters from a stored status when one exists
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]

    # Wrap the stored environment model into the I2A actor-critic model
    environment_model = utils.load_model(utils.get_model_dir(environment_model_name))
    i2a_model = I2AModel(environment_class, environment_model, imagination_steps)

    algorithm.load_acmodel(i2a_model)

    logger.info("Using environment model: {}\n".format(environment_model_name))
    logger.info("{}\n".format(environment_model))

    logger.info("Agent architecture:\n")
    logger.info("{}\n".format(i2a_model))

    while num_frames < frames:
        # One optimisation step, timed for the FPS figure
        step_began = time.time()
        logs = algorithm.update_parameters()
        step_ended = time.time()

        num_frames += logs["num_frames"]
        update += 1

        if update % log_interval == 0:
            fps = logs["num_frames"] / (step_ended - step_began)
            duration = int(time.time() - total_start_time)
            return_stats = utils.synthesize(logs["return_per_episode"])
            rreturn_stats = utils.synthesize(logs["reshaped_return_per_episode"])
            frame_stats = utils.synthesize(logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]

            # Synthesized statistics, in the order the log line expects them
            for prefix, stats in (("rreturn_", rreturn_stats),
                                  ("num_frames_", frame_stats)):
                header.extend(prefix + key for key in stats.keys())
                data.extend(stats.values())

            # Scalar training statistics reported by the algorithm
            for field in ("entropy", "value", "policy_loss", "value_loss",
                          "grad_norm", "distillation_loss"):
                header.append(field)
                data.append(logs[field])

            logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:x̄σmM {:.2f} {:.2f} {:.2f} {:.2f} | F:x̄σmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f} | dL {:.3f}"
                    .format(*data))

            # Raw returns go to CSV / Tensorboard only, not to the text log
            header.extend("return_" + key for key in return_stats.keys())
            data.extend(return_stats.values())

            # Header row is written once, while the status still reads zero
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if tensorboard:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}
            utils.save_status(status, model_dir)

        # Periodically persist the model
        if save_interval > 0 and update % save_interval == 0:
            utils.save_model(algorithm.acmodel, model_dir)
            logger.info("Model successfully saved")
Exemplo n.º 10
0
        txt_logger.info(
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
            .format(*data))

        header += ["return_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()

        if status["num_frames"] == 0:
            csv_logger.writerow(header)
        csv_logger.writerow(data)
        csv_file.flush()

        # save to tensorboard
        for field, value in zip(header, data):
            tb_writer.add_scalar(field, value)
            experiment.log_metric(field, value, epoch=update)
    # Save status

    if args.save_interval > 0 and update % args.save_interval == 0:
        status = {
            "num_frames": num_frames,
            "update": update,
            "model_state": acmodel.state_dict(),
            "optimizer_state": algo.optimizer.state_dict()
        }
        if hasattr(preprocess_obss, "vocab"):
            status["vocab"] = preprocess_obss.vocab.vocab
        utils.save_status(status, model_dir, args.seed)
        txt_logger.info("Status saved")
Exemplo n.º 11
0
def run(full_args: Namespace) -> None:
    """Build environments, model and agent from ``full_args`` and train.

    Configuration is read from ``full_args.main``, ``full_args.agent``,
    ``full_args.model`` and ``full_args.env_cfg`` (plus the optional
    ``extra_logs`` attribute). Training resumes from any status / training
    data found in the model directory and loops over
    ``algo.update_parameters()`` until ``args.frames`` frames were seen.

    Args:
        full_args: Experiment configuration namespace.

    Raises:
        SystemExit: When ``args.max_eprews`` is non-zero and the last logged
            return exceeds it.
    """
    # import torch.multiprocessing as mp
    # mp.set_start_method('spawn')

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    env_args = full_args.env_cfg
    extra_logs = getattr(full_args, "extra_logs", None)

    # A seed of 0 means "derive the seed from the run id".
    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==============================================================================================
    # @ torc_rl repo original

    # Define logger, CSV writer and Tensorboard writer

    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments

    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==============================================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==============================================================================================
    # Generate environments

    envs = []

    # Get environment wrapper
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:

        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(environment, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            # Wrappers are applied in reverse order, so the first configured
            # wrapper ends up outermost.
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    master_make_envs = getattr(full_args.env_cfg, "master_make_envs", False)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args,
                                    env_wrapper,
                                    no_envs,
                                    master_make=master_make_envs)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps
            env.no_stacked_frames = full_args.env_cfg.no_stacked_frames

            # Give every environment its own, well separated seed.
            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args,
                                         env_wrapper,
                                         no_envs,
                                         master_make=master_make_envs)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env,
        first_env.observation_space,
        model_dir,
        max_image_value=max_image_value,
        normalize=normalize_img)

    # ==============================================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir,
                           save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==============================================================================================
    # Load Model

    if model is None:
        model = get_model(model_args,
                          obs_space,
                          first_env.action_space,
                          use_memory=model_args.use_memory,
                          no_stacked_frames=env_args.no_stacked_frames)
        logger.info(f"Model [{model_args.name}] successfully created\n")

        # Print Model info
        logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==============================================================================================
    # Load Agent

    algo = get_agent(full_args.agent,
                     envs,
                     model,
                     agent_data,
                     preprocess_obss=preprocess_obss,
                     reshape_reward=None,
                     eval_envs=eval_envs)

    has_evaluator = hasattr(algo,
                            "evaluate") and full_args.env_cfg.no_eval_envs > 0

    # ==============================================================================================
    # Train model

    crt_eprew = 0
    # other_data stays None when no previous training data could be loaded,
    # so it must be checked before the membership test.
    if other_data is not None and "eprew" in other_data:
        crt_eprew = other_data["eprew"]
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if has_evaluator:
            if update % args.eval_interval == 0:
                algo.evaluate()

        # FPS is measured between the ends of two consecutive updates.
        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # The CSV header is only written when training started from
            # scratch; `status` is replaced below, so this fires once.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            # First synthesized statistic of the reshaped return.
            crt_eprew = list(rreturn_per_episode.values())[0]

        # -- Save vocabulary and model

        if args.save_interval > 0 and update % args.save_interval == 0:
            # preprocess_obss.vocab.save()

            saver.save_training_data(model, algo.get_save_data(), crt_eprew)

            logger.info("Model successfully saved")

            utils.save_status(status, model_dir)

        # Early stop; a threshold of 0 disables the check. The message
        # reports the configured threshold rather than a fixed number.
        if max_eprews != 0 and crt_eprew > max_eprews:
            print(f"Reached max return {max_eprews}")
            sys.exit()
def tuner(icm_lr, reward_weighting, normalise_rewards, args):
    """Run one training job with the given curiosity hyper-parameters.

    Builds 16 environments, an ``ACModel`` and an
    ``AutoencoderWithUncertainty``, wires them into the algorithm selected
    by ``args.algo`` and trains until ``args.frames`` frames were seen,
    logging every ``args.log_interval`` updates and saving the status every
    ``args.save_interval`` updates.

    Args:
        icm_lr: Learning rate of the Adam optimizer used for the autoencoder.
        reward_weighting: Forwarded unchanged to the algorithm constructor.
        normalise_rewards: Forwarded unchanged to the algorithm constructor.
        args: Namespace with the remaining run options; ``args.mem`` is
            overwritten here from ``args.recurrence``.

    Returns:
        None.

    Raises:
        ValueError: If ``args.algo`` is neither ``"a2c"`` nor ``"ppo"``.
    """
    import argparse
    import datetime
    import torch
    import torch_ac
    import tensorboardX
    import sys
    import numpy as np
    from model import ACModel
    from .a2c import A2CAlgo

    # from .ppo import PPOAlgo

    frames_to_visualise = 200
    # Parse arguments

    # Memory is enabled exactly when gradients flow back more than one step.
    args.mem = args.recurrence > 1

    def make_exploration_heatmap(args, plot_title):
        """Save a log-scaled heatmap of the stored visitation counts.

        NOTE(review): this helper is not called anywhere in this function,
        and the ``plot_title`` argument is overwritten before use.
        """
        import numpy as np
        import matplotlib.pyplot as plt

        visitation_counts = np.load(
            f"{args.model}_visitation_counts.npy", allow_pickle=True
        )
        plot_title = str(np.count_nonzero(visitation_counts)) + args.model
        plt.imshow(np.log(visitation_counts))
        plt.colorbar()
        plt.title(plot_title)
        plt.savefig(f"{plot_title}_visitation_counts.png")

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"
    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")
    # Load environments

    envs = []

    for i in range(16):
        an_env = utils.make_env(
            args.env, int(args.frames_before_reset), int(args.environment_seed)
        )
        envs.append(an_env)
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    # adapted from impact driven RL
    from .models import AutoencoderWithUncertainty

    autoencoder = AutoencoderWithUncertainty(observation_shape=(7, 7, 3)).to(device)

    autoencoder_opt = torch.optim.Adam(
        autoencoder.parameters(), lr=icm_lr, weight_decay=0
    )
    if args.algo == "a2c":
        algo = A2CAlgo(
            envs,
            acmodel,
            autoencoder,
            autoencoder_opt,
            args.uncertainty,
            args.noisy_tv,
            args.curiosity,
            args.randomise_env,
            args.uncertainty_budget,
            args.environment_seed,
            reward_weighting,
            normalise_rewards,
            args.frames_before_reset,
            device,
            args.frames_per_proc,
            args.discount,
            args.lr,
            args.gae_lambda,
            args.entropy_coef,
            args.value_loss_coef,
            args.max_grad_norm,
            args.recurrence,
            args.optim_alpha,
            args.optim_eps,
            preprocess_obss,
            None,
            args.random_action,
        )
    elif args.algo == "ppo":
        # NOTE(review): the local PPOAlgo import above is commented out, so
        # this branch relies on PPOAlgo being defined at module level.
        algo = PPOAlgo(
            envs,
            acmodel,
            autoencoder,
            autoencoder_opt,
            args.uncertainty,
            args.noisy_tv,
            args.curiosity,
            args.randomise_env,
            args.uncertainty_budget,
            args.environment_seed,
            reward_weighting,
            normalise_rewards,
            device,
            args.frames_per_proc,
            args.discount,
            args.lr,
            args.gae_lambda,
            args.entropy_coef,
            args.value_loss_coef,
            args.max_grad_norm,
            args.recurrence,
            args.optim_eps,
            args.clip_eps,
            args.epochs,
            args.batch_size,
            preprocess_obss,
        )

    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        # On duplicate keys the update logs win over the collection logs.
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        log_to_wandb(logs, start_time, update_start_time, update_end_time)

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])
            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += [
                "intrinsic_rewards",
                "uncertainties",
                "novel_states_visited",
                "entropy",
                "value",
                "policy_loss",
                "value_loss",
                "grad_norm",
            ]
            data += [
                logs["intrinsic_rewards"].mean().item(),
                logs["uncertainties"].mean().item(),
                logs["novel_states_visited"].mean().item(),
                logs["entropy"],
                logs["value"],
                logs["policy_loss"],
                logs["value_loss"],
                logs["grad_norm"],
            ]
            # One placeholder per entry of `data`, in the same order as
            # `header`: the three curiosity statistics come before entropy,
            # so they need their own labels (iR / unc / nov); otherwise the
            # H / V / pL labels would show the wrong values.
            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | iR {:.3f} | unc {:.3f} | nov {:.3f} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}".format(
                    *data
                )
            )
        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict(),
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
    return
Exemplo n.º 13
0
def main(env_name, seed, meta, load_id, procs, fullObs, POfullObs, frames,
         log_interval, save_interval, experimental, _run):
    """Train the pi_old / pi_train model pair, logging and checkpointing.

    Called by sacred with arguments filled in from default.yaml or command line.

    Args:
        env_name: Environment id passed to ``gym.make``.
        seed: Base random seed; environment ``i`` is seeded with
            ``seed + 10000 * i``.
        meta: Mapping whose ``'label'`` entry prefixes the model name.
        load_id: Id of a stored run to resume from, or ``None`` to start
            from a freshly created model.
        procs: Number of environments to create.
        fullObs: If truthy, wrap every environment in ``FullyObsWrapper``.
        POfullObs: If truthy (and ``fullObs`` is falsy), wrap every
            environment in ``PartialObsFullGridWrapper``.
        frames: Training stops once ``num_frames`` reaches this value.
        log_interval: Emit logs every ``log_interval`` updates; a value
            <= 0 disables logging.
        save_interval: Save model and status every ``save_interval``
            updates; a value <= 0 disables periodic saving.
        experimental: Mapping of option names to values, copied onto
            ``exp_config`` as attributes.
        _run: Run object; its ``_id`` is used in the model name and its
            ``log_scalar`` receives every logged metric.
    """

    # Make a bunch of experimental options available everywhere for easy change
    for cfg in experimental:
        setattr(exp_config, cfg, experimental[cfg])

    cuda = torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")

    model_name = meta['label'] + "_{}".format(_run._id)
    model_dir = utils.get_model_dir(model_name)

    # Define logger and CSV writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)

    # Log command and all script arguments
    logger.info("{}\n".format(" ".join(sys.argv)))

    # Set seed for all randomness sources
    utils.seed(seed)

    # Generate environments
    envs = []
    for i in range(procs):
        env = gym.make(env_name)
        env.seed(seed + 10000 * i)
        if fullObs:
            env = gym_minigrid.wrappers.FullyObsWrapper(env)
        elif POfullObs:
            env = gym_minigrid.wrappers.PartialObsFullGridWrapper(env)
        envs.append(env)

    # Define obss preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        env_name, envs[0].observation_space, model_dir)

    # Load training status
    if load_id is not None:
        model1, model2, status = utils.load_status_and_model_from_db(
            db_uri, db_name, model_dir, load_id)
        # pi_old (model1) may be absent in the stored run
        if model1 is not None:
            model1 = model1.to(device)
        model2 = model2.to(device)
        acmodels = model1, model2
        current_cycle_count, _ = scheduling(status['num_frames'])

        logger.info("Model successfully loaded\n")
        logger.info("Loaded status: {}".format(status))
    else:
        # First one is pi_old, second one is pi_train
        acmodels = [None, create_model(obs_space, envs)]
        status = {"num_frames": 0, "update": 0}
        current_cycle_count = 0

        logger.info("Model successfully created\n")
    # Log pi_train: pi_old (index 0) is None for fresh runs and may be None
    # for loaded ones, whereas pi_train is always an actual model.
    logger.info("{}\n".format(acmodels[1]))

    logger.info("Used device: {}\n".format(device))

    # Define actor-critic algo
    algo = create_algo(envs, *acmodels, preprocess_obss)

    # Train model
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    # The CSV header is written once, and only for runs starting at frame 0.
    # This is tracked separately from `status` so that saving a status before
    # the first log line cannot suppress the header.
    write_csv_header = status["num_frames"] == 0

    while num_frames < frames:
        # Update model parameters

        cycle_count, alpha = scheduling(num_frames)

        if cycle_count != current_cycle_count:
            current_cycle_count = cycle_count
            switch_training_model(algo, obs_space, envs)
            logger.info("Switched training model")

        update_start_time = time.time()
        logs = algo.update_parameters(alpha)
        update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        # Print logs (guarded like save_interval so that 0 disables logging
        # instead of raising ZeroDivisionError)
        if log_interval > 0 and update % log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value_train", "value_old", "policy_loss_train",
                "policy_loss_old", "value_loss_train", "value_loss_old"
            ]
            data += [
                logs["entropy"], logs["value_train"], logs["value_old"],
                logs["policy_loss_train"], logs["policy_loss_old"],
                logs["value_loss_train"], logs["value_loss_old"]
            ]
            header += [
                "grad_norm_train", "grad_norm_old", "alpha", "reg_loss_policy",
                "reg_loss_value"
            ]
            data += [
                logs["grad_norm_train"], logs["grad_norm_old"], alpha,
                logs["reg_loss_policy"], logs["reg_loss_value"]
            ]

            # The text log is split in two lines: the first consumes the
            # first 15 entries of `data`, the second consumes the rest.
            logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V:to {:.3f} {:.3f} "
                .format(*data[:15]))
            logger.info(
                "pL:to {:.3f} {:.3f} | vL:to {:.3f} {:.3f} | ∇:to {:.3f} {:.3f} | alpha {:.2f} | rLpv {:.3f} {:.3f}\n"
                .format(*data[15:]))

            # Non-reshaped returns go to the CSV / scalar logs only
            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if write_csv_header:
                csv_writer.writerow(header)
                write_csv_header = False
            csv_writer.writerow(data)
            csv_file.flush()

            for head, dat in zip(header, data):
                _run.log_scalar(head, dat, num_frames)

        # Save vocabulary and model
        if save_interval > 0 and update % save_interval == 0:
            # Not every preprocessor necessarily carries a vocabulary
            if hasattr(preprocess_obss, "vocab"):
                preprocess_obss.vocab.save()

            utils.save_model(algo.pi_old, algo.pi_train, model_dir)
            logger.info("Model successfully saved")
            # Build the status from the current counters so that it always
            # matches the model saved just above, even when the save and log
            # intervals do not line up.
            status = {"num_frames": num_frames, "update": update}
            utils.save_status(status, model_dir)

    utils.save_model_to_db(algo.pi_old, algo.pi_train, model_dir, num_frames,
                           _run)
    utils.save_status_to_db({
        "num_frames": num_frames,
        "update": update
    }, model_dir, num_frames, _run)