Example #1
            args.model, envs[0].observation_space, args.pretrained_model)
    else:
        obss_preprocessor = utils.ObssPreprocessor(args.model,
                                                   envs[0].observation_space,
                                                   args.pretrained_model)

# Define actor-critic model
acmodel = utils.load_model(args.model, raise_not_found=False)

if acmodel is None:
    # No saved model was found, so build a fresh one
    acmodel = ACModelGNN(obss_preprocessor.obs_space, envs[0].action_space,
                         args.image_dim, args.memory_dim, args.instr_dim,
                         not args.no_instr, args.instr_arch, not args.no_mem,
                         args.arch)

obss_preprocessor.vocab.save()
utils.save_model(acmodel, args.model)

# if torch.cuda.is_available():
#     acmodel.cuda()

# Define actor-critic algo

reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = babyai.rl.PPOAlgoGNN(envs, acmodel, args.frames_per_proc,
                                args.discount, args.lr, args.beta1, args.beta2,
                                args.gae_lambda, args.entropy_coef,
                                args.value_loss_coef, args.max_grad_norm,
                                args.recurrence, args.optim_eps, args.clip_eps,
                                args.ppo_epochs, args.batch_size,
                                obss_preprocessor, reshape_reward)
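A minimal sketch of how the resulting algo object is typically driven; this assumes PPOAlgoGNN exposes the same update_parameters() interface as babyai.rl.PPOAlgo, returning a log dict that includes the number of frames collected.

# Hypothetical driver loop (assumes a babyai.rl-style update_parameters()
# that runs one round of rollouts plus PPO epochs per call).
num_frames = 0
while num_frames < args.frames:
    logs = algo.update_parameters()
    num_frames += logs["num_frames"]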
Example #2
    def __init__(
        self,
        args,
    ):
        self.args = args

        utils.seed(self.args.seed)

        # args.multi_env is a list of environment ids when training on multiple environments
        if getattr(args, 'multi_env', None):
            self.env = [gym.make(item) for item in args.multi_env]

            self.train_demos = []
            for demos, episodes in zip(args.multi_demos, args.multi_episodes):
                demos_path = utils.get_demos_path(demos,
                                                  None,
                                                  None,
                                                  valid=False)
                logger.info('loading {} of {} demos'.format(episodes, demos))
                train_demos = utils.load_demos(demos_path)
                logger.info('loaded demos')
                if episodes > len(train_demos):
                    raise ValueError(
                        "there are only {} train demos in {}".format(
                            len(train_demos), demos))
                self.train_demos.extend(train_demos[:episodes])
                logger.info('So far, {} demos loaded'.format(
                    len(self.train_demos)))

            self.val_demos = []
            for demos, episodes in zip(args.multi_demos, [args.val_episodes] *
                                       len(args.multi_demos)):
                demos_path_valid = utils.get_demos_path(demos,
                                                        None,
                                                        None,
                                                        valid=True)
                logger.info('loading {} of {} valid demos'.format(
                    episodes, demos))
                valid_demos = utils.load_demos(demos_path_valid)
                logger.info('loaded demos')
                if episodes > len(valid_demos):
                    logger.info(
                        'Using all the available {} demos to evaluate valid. accuracy'
                        .format(len(valid_demos)))
                self.val_demos.extend(valid_demos[:episodes])
                logger.info('So far, {} valid demos loaded'.format(
                    len(self.val_demos)))

            logger.info('Loaded all demos')

            observation_space = self.env[0].observation_space
            action_space = self.env[0].action_space

        else:
            self.env = gym.make(self.args.env)

            demos_path = utils.get_demos_path(args.demos,
                                              args.env,
                                              args.demos_origin,
                                              valid=False)
            demos_path_valid = utils.get_demos_path(args.demos,
                                                    args.env,
                                                    args.demos_origin,
                                                    valid=True)

            logger.info('loading demos')
            self.train_demos = utils.load_demos(demos_path)
            logger.info('loaded demos')
            if args.episodes:
                if args.episodes > len(self.train_demos):
                    raise ValueError("there are only {} train demos".format(
                        len(self.train_demos)))
                self.train_demos = self.train_demos[:args.episodes]

            self.val_demos = utils.load_demos(demos_path_valid)
            if args.val_episodes > len(self.val_demos):
                logger.info(
                    'Using all the available {} demos to evaluate valid. accuracy'
                    .format(len(self.val_demos)))
            self.val_demos = self.val_demos[:self.args.val_episodes]

            observation_space = self.env.observation_space
            action_space = self.env.action_space

        self.obss_preprocessor = utils.ObssPreprocessor(
            args.model, observation_space,
            getattr(self.args, 'pretrained_model', None))

        # Define actor-critic model
        self.acmodel = utils.load_model(args.model, raise_not_found=False)
        if self.acmodel is None:
            if getattr(self.args, 'pretrained_model', None):
                logger.info("Loading pretrained model")
                self.acmodel = utils.load_model(args.pretrained_model,
                                                raise_not_found=True)
            else:
                logger.info('Creating new model')
                self.acmodel = ACModel(
                    self.obss_preprocessor.obs_space,
                    action_space,
                    args.image_dim,
                    args.memory_dim,
                    args.instr_dim,
                    not self.args.no_desc,
                    self.args.instr_arch,
                    not self.args.no_mem,
                    self.args.arch,
                    random_shuffled=self.args.random_shuffle,
                    instr_sents=self.env.n_floor_colors,
                    enable_instr=self.args.enable_instr,
                    instr_only=self.args.instr_only)
        self.obss_preprocessor.vocab.save()
        utils.save_model(self.acmodel, args.model)

        self.acmodel.train()
        if torch.cuda.is_available():
            self.acmodel.cuda()

        self.optimizer = torch.optim.Adam(self.acmodel.parameters(),
                                          self.args.lr,
                                          eps=self.args.optim_eps)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         step_size=100,
                                                         gamma=0.9)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
Example #3
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:x̄σmM {: .2f} {: .2f} {: .2f} {: .2f} | F:x̄σmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {: .3f} | vL {:.3f}"
                .format(i, num_frames, fps, duration,
                        *rreturn_per_episode.values(),
                        *num_frames_per_episode.values(),
                        logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"]))
        if args.tb:
            writer.add_scalar("frames", num_frames, i)
            writer.add_scalar("FPS", fps, i)
            writer.add_scalar("duration", total_ellapsed_time, i)
            for key, value in return_per_episode.items():
                writer.add_scalar("return_" + key, value, i)
            for key, value in rreturn_per_episode.items():
                writer.add_scalar("rreturn_" + key, value, i)
            for key, value in num_frames_per_episode.items():
                writer.add_scalar("num_frames_" + key, value, i)
            writer.add_scalar("entropy", logs["entropy"], i)
            writer.add_scalar("value", logs["value"], i)
            writer.add_scalar("policy_loss", logs["policy_loss"], i)
            writer.add_scalar("value_loss", logs["value_loss"], i)

    # Save obss preprocessor vocabulary and model

    if args.save_interval > 0 and i % args.save_interval == 0:
        obss_preprocessor.vocab.save()

        if torch.cuda.is_available():
            acmodel.cpu()
        utils.save_model(acmodel, model_name)
        logger.info("Model is saved.")
        if torch.cuda.is_available():
            acmodel.cuda()
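The four statistics unpacked from rreturn_per_episode and num_frames_per_episode follow the x̄ σ m M order of the log format string. Below is a hedged sketch of how such per-episode statistics are typically assembled; the helper is a local stand-in modelled on babyai.utils.synthesize, not the library function itself.

import collections
import numpy as np

def synthesize(values):
    # mean, std, min, max -- the order the format string expects
    stats = collections.OrderedDict()
    stats["mean"] = np.mean(values)
    stats["std"] = np.std(values)
    stats["min"] = np.amin(values)
    stats["max"] = np.amax(values)
    return stats

rreturn_per_episode = synthesize([0.0, 0.5, 0.9])
print(*rreturn_per_episode.values())  # ~0.47 ~0.37 0.0 0.9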
Example #4
        advice_end_index = advice_start_index + env.action_space.n + 1
        acmodel = ACModel(obss_preprocessor.obs_space,
                          envs[0].action_space,
                          envs[0],
                          args.image_dim,
                          args.memory_dim,
                          args.instr_dim,
                          not args.no_instr,
                          args.instr_arch,
                          not args.no_mem,
                          advice_dim=128,
                          advice_start_index=advice_start_index,
                          advice_end_index=advice_end_index)

obss_preprocessor.vocab.save()
utils.save_model(acmodel, args.model)

if torch.cuda.is_available():
    acmodel.cuda()

# Define actor-critic algo

reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = PPOAlgo(acmodel, envs, args.frames_per_proc, args.discount, args.lr,
                   args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
                   args.value_loss_coef, args.max_grad_norm, args.recurrence,
                   args.optim_eps, args.clip_eps, args.ppo_epochs,
                   args.batch_size)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))
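For illustration, the reshape_reward callback defined above ignores the observation, action and done arguments and simply rescales the raw reward; the reward_scale value below is an assumed stand-in for args.reward_scale.

reward_scale = 20  # assumed value of args.reward_scale
reshape_reward = lambda _0, _1, reward, _2: reward_scale * reward
print(reshape_reward(None, None, 0.5, False))  # 10.0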
Example #5
    if model is None:
        if pretrained[m]:
            models.append(utils.load_model(pretrained[m],
                                           raise_not_found=True))
        else:
            models.append(
                ACModel(obss_preprocessor.obs_spaces[m], envs[0].action_space,
                        args.image_dim, args.memory_dim, args.instr_dim,
                        args.enc_dim, args.dec_dim, args.len_message,
                        args.num_symbols))
    else:
        models.append(model)

for m, model in enumerate(models):
    obss_preprocessor.vocabs[m].save()
    utils.save_model(model, model_names[m])

if torch.cuda.is_available():
    for model in models:
        model.cuda()

# Define actor-critic algorithm.
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
algo = PPOAlgo(penv, models, args.frames_per_proc, args.discount, args.lr,
               args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
               args.value_loss_coef, args.max_grad_norm, args.recurrence,
               args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
               obss_preprocessor, reshape_reward, not args.no_comm,
               args.conventional)

for m, model_name in enumerate(model_names):
Example #6
    def train(self, train_demos, writer, csv_writer, status_path, header, reset_status=False):
        # Load the status
        def initial_status():
            return {'i': 0,
                    'num_frames': 0,
                    'patience': 0}

        status = initial_status()
        if os.path.exists(status_path) and not reset_status:
            with open(status_path, 'r') as src:
                status = json.load(src)
        elif not os.path.exists(os.path.dirname(status_path)):
            # Ensure that the status directory exists
            os.makedirs(os.path.dirname(status_path))

        # If the batch size is larger than the number of demos, we need to lower the batch size
        if self.args.batch_size > len(train_demos):
            self.args.batch_size = len(train_demos)
            logger.info("Batch size too high. Setting it to the number of train demos ({})".format(len(train_demos)))

        # Model saved initially to avoid "Model not found Exception" during first validation step
        utils.save_model(self.acmodel, self.args.model)

        # best success rate so far, used to track performance on the validation set
        best_success_rate, patience, i = 0, 0, 0
        total_start_time = time.time()

        while status['i'] < getattr(self.args, 'epochs', int(1e9)):
            if 'patience' not in status:  # e.g. when fine-tuning an RL-pretrained agent with IL
                status['patience'] = 0
            # Do not learn if using a pre-trained model that already lost patience
            if status['patience'] > self.args.patience:
                break
            if status['num_frames'] > self.args.frames:
                break

            status['i'] += 1
            i = status['i']
            update_start_time = time.time()

            # Learning rate scheduler
            self.scheduler.step()

            log = self.run_epoch_recurrence(train_demos, is_training=True)
            total_len = sum([len(item[3]) for item in train_demos])
            status['num_frames'] += total_len

            update_end_time = time.time()

            # Print logs
            if status['i'] % self.args.log_interval == 0:
                total_ellapsed_time = int(time.time() - total_start_time)

                fps = total_len / (update_end_time - update_start_time)
                duration = datetime.timedelta(seconds=total_ellapsed_time)

                for key in log:
                    log[key] = np.mean(log[key])

                train_data = [status['i'], status['num_frames'], fps, total_ellapsed_time,
                              log["entropy"], log["policy_loss"], log["accuracy"]]

                logger.info(
                    "U {} | F {:06} | FPS {:04.0f} | D {} | H {:.3f} | pL {: .3f} | A {: .3f}".format(*train_data))

                # Log the gathered data only when we don't evaluate the validation metrics; it will be logged
                # anyway afterwards, when status['i'] % self.args.val_interval == 0
                if status['i'] % self.args.val_interval != 0:
                    # instantiate a validation_log with empty strings when no validation is done
                    validation_data = [''] * len([key for key in header if 'valid' in key])
                    assert len(header) == len(train_data + validation_data)
                    if self.args.tb:
                        for key, value in zip(header, train_data):
                            writer.add_scalar(key, float(value), status['num_frames'])
                    csv_writer.writerow(train_data + validation_data)

            if status['i'] % self.args.val_interval == 0:

                valid_log = self.validate(self.args.val_episodes)
                mean_return = [np.mean(log['return_per_episode']) for log in valid_log]
                success_rate = [np.mean([1 if r > 0 else 0 for r in log['return_per_episode']]) for log in
                                valid_log]

                val_log = self.run_epoch_recurrence(self.val_demos)
                validation_accuracy = np.mean(val_log["accuracy"])

                if status['i'] % self.args.log_interval == 0:
                    validation_data = [validation_accuracy] + mean_return + success_rate
                    logger.info(("Validation: A {: .3f} " + ("| R {: .3f} " * len(mean_return) +
                                                             "| S {: .3f} " * len(success_rate))
                                 ).format(*validation_data))

                    assert len(header) == len(train_data + validation_data)
                    if self.args.tb:
                        for key, value in zip(header, train_data + validation_data):
                            writer.add_scalar(key, float(value), status['num_frames'])
                    csv_writer.writerow(train_data + validation_data)

                # In the multi-env case, the update condition would be a better mean success rate
                if np.mean(success_rate) > best_success_rate:
                    best_success_rate = np.mean(success_rate)
                    status['patience'] = 0
                    with open(status_path, 'w') as dst:
                        json.dump(status, dst)
                    # Saving the model
                    logger.info("Saving best model")

                    if torch.cuda.is_available():
                        self.acmodel.cpu()
                    utils.save_model(self.acmodel, self.args.model + "_best")
                    self.obss_preprocessor.vocab.save(utils.get_vocab_path(self.args.model + "_best"))
                    if torch.cuda.is_available():
                        self.acmodel.cuda()
                else:
                    status['patience'] += 1
                    logger.info(
                        "Losing patience, new value={}, limit={}".format(status['patience'], self.args.patience))

                if torch.cuda.is_available():
                    self.acmodel.cpu()
                utils.save_model(self.acmodel, self.args.model)
                self.obss_preprocessor.vocab.save()
                if torch.cuda.is_available():
                    self.acmodel.cuda()
                with open(status_path, 'w') as dst:
                    json.dump(status, dst)
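A hedged sketch of the kind of header this train() method expects: its length must equal len(train_data) + len(validation_data), and the columns containing 'valid' are the ones blanked out on non-validation updates. The exact column names below are assumptions modelled on the lists built in the method, not taken from the source.

header = (["update", "frames", "FPS", "duration",
           "entropy", "policy_loss", "train_accuracy"]      # train_data
          + ["validation_accuracy", "validation_return",
             "validation_success_rate"])                    # validation_data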
Example #7
        acmodel0 = ACModel(obss_preprocessor.obs_space, envs0[0].action_space,
                           args.image_dim, args.memory_dim, args.instr_dim, args.enc_dim, args.dec_dim,
                           not args.no_instr, args.instr_arch, not args.no_mem, args.arch,
                           args.len_message, args.num_symbols)
if acmodel1 is None:
    if args.pretrained_model:
        acmodel1 = utils.load_model(args.pretrained_model, 1, raise_not_found=True)
    else:
        #torch.manual_seed(args.seed)
        acmodel1 = ACModel(obss_preprocessor.obs_space, envs1[0].action_space,
                           args.image_dim, args.memory_dim, args.instr_dim, args.enc_dim, args.dec_dim,
                           not args.no_instr, args.instr_arch, not args.no_mem, args.arch,
                           args.len_message, args.num_symbols)

obss_preprocessor.vocab.save()
utils.save_model(acmodel0, args.model, 0)
utils.save_model(acmodel1, args.model, 1)

if torch.cuda.is_available():
    acmodel0.cuda()
    acmodel1.cuda()

# Define actor-critic algo

reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = babyai.rl.PPOAlgo(envs0, envs1, acmodel0, acmodel1, args.frames_per_proc, args.discount, args.lr, args.beta1, args.beta2,
                              args.gae_lambda,
                              args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                              args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size, obss_preprocessor,
                              reshape_reward)
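Because save_model and load_model are called here with an extra agent index, restoring the two agents later mirrors the same calls; a minimal sketch, assuming the indexed load_model signature used in this example.

# Restore both agents from the checkpoints written above.
acmodel0 = utils.load_model(args.model, 0, raise_not_found=True)
acmodel1 = utils.load_model(args.model, 1, raise_not_found=True)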