Example #1
    def __init__(self,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space,
                               action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size)

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
Example #2
def create_model(obs_space, envs):
    """Helper function to create new model faster."""
    cuda = torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    model = ACModel(obs_space, envs[0].action_space)
    model = model.to(device)
    return model
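A minimal usage sketch for the helper above. The environment name and the utils.make_env / utils.get_obss_preprocessor helpers are assumptions carried over from the other examples, not part of this snippet:

# Hypothetical usage (env name and utils helpers are assumed)
envs = [utils.make_env("MiniGrid-Empty-5x5-v0", 1 + 10000 * i) for i in range(4)]
obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
model = create_model(obs_space, envs)  # already moved to the GPU if one is available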
Example #3
    def __init__(self, learning_rate, discount, action_space, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.learning_rate = learning_rate
        self.discount = discount
        self.action = None

        self.ac_model = ACModel(action_space)
        self.ac_model.compile(optimizer=Adam(learning_rate=learning_rate))
Example #4
    def __init__(self, talker):
        super(ACBrain, self).__init__()
        self.model = ACModel()
        self.model.build((None, IMG_H, IMG_W, k))
        self.talker = talker
        self.i = 1
        self.optimizer = optim.Adam(learning_rate=CustomSchedule(lr))
        self.states_list = self.talker.states_list
        self.memory = []
        self.one_episode_reward_index = 0
Example #5
    def __init__(self, obs_space, action_space, model_dir, device=None, argmax=False, num_envs=1):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
        self.acmodel = ACModel(obs_space, action_space)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
Example #6
    def __init__(self, obs_space, action_space, model_dir, device=None, argmax=False, num_envs=1):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
        self.acmodel = ACModel(obs_space, action_space)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs, self.acmodel.memory_size)

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
Example #7
    def __init__(self,
                 env,
                 obs_space,
                 action_space,
                 model_dir,
                 ignoreLTL,
                 progression_mode,
                 gnn,
                 recurrence=1,
                 dumb_ac=False,
                 device=None,
                 argmax=False,
                 num_envs=1):
        try:
            print(model_dir)
            status = utils.get_status(model_dir)
        except OSError:
            status = {"num_frames": 0, "update": 0}

        using_gnn = (gnn != "GRU" and gnn != "LSTM")
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            env, using_gnn, progression_mode)
        if "vocab" in status and self.preprocess_obss.vocab is not None:
            self.preprocess_obss.vocab.load_vocab(status["vocab"])

        if recurrence > 1:
            self.acmodel = RecurrentACModel(env, obs_space, action_space,
                                            ignoreLTL, gnn, dumb_ac, True)
            self.memories = torch.zeros(num_envs,
                                        self.acmodel.memory_size,
                                        device=device)
        else:
            self.acmodel = ACModel(env, obs_space, action_space, ignoreLTL,
                                   gnn, dumb_ac, True)

        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
Example #8
class Agent:
    """An agent.

    It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""
    def __init__(self,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_rim=False):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space, action_space, use_rim=use_rim)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size).to(device)

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))

    def get_actions(self, obss):
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:
                dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(dones, dtype=torch.float).to(
                self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
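For context, a sketch of how an Agent like the one above is typically driven at evaluation time. The environment name, the model directory, and the utils.make_env helper are assumptions borrowed from the other examples:

# Hypothetical evaluation loop (env name, model_dir, and utils.make_env are assumed)
env = utils.make_env("MiniGrid-DoorKey-5x5-v0", 0)
agent = Agent(env.observation_space, env.action_space, "storage/DoorKey")

obs = env.reset()
done = False
while not done:
    action = agent.get_action(obs)
    obs, reward, done, _ = env.step(action)
    agent.analyze_feedback(reward, done)  # zeroes the recurrent memories when an episode ends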
Example #9
    def __init__(self,
                 env,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False):
        obs_space, self.preprocess_obs_goals = utils.get_obs_goals_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space,
                               action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        status = utils.get_status(model_dir)

        self.goals = list(status['agent_goals'].values())
        # for goal in self.goals:
        #     goal = env.unwrapped.get_obs_render( goal, tile_size=32)
        #     plt.imshow(goal)
        #     plt.show()

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size,
                                        device=self.device)

        self.acmodel.load_state_dict(status["model_state"])
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obs_goals, "vocab"):
            self.preprocess_obs_goals.vocab.load_vocab(status["vocab"])
Example #10
class ACAgent(AgentBase):
    def __init__(self, learning_rate, discount, action_space, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.learning_rate = learning_rate
        self.discount = discount
        self.action = None

        self.ac_model = ACModel(action_space)
        self.ac_model.compile(optimizer=Adam(learning_rate=learning_rate))

    def before(self, *args, **kwargs):
        pass

    def after(self, *args, **kwargs):
        pass

    def act(self, state) -> int:
        state = tf.convert_to_tensor([state])
        _, probs = self.ac_model(state)

        action_probs = tfp.distributions.Categorical(probs=probs)
        self.action = action_probs.sample()
        return self.action.numpy().item()

    def learn(self, *args, **kwargs):
        state = tf.convert_to_tensor([kwargs['state']], dtype=tf.float32)
        next_state = tf.convert_to_tensor([kwargs['next_state']],
                                          dtype=tf.float32)
        reward = tf.convert_to_tensor([kwargs['reward']], dtype=tf.float32)
        done = kwargs['done']

        with tf.GradientTape(persistent=False) as tape:
            state_val, probs = self.ac_model(state)
            next_state_val, _ = self.ac_model(next_state)

            state_val = tf.squeeze(state_val)
            next_state_val = tf.squeeze(next_state_val)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            exp_val = reward + self.discount * next_state_val * (
                1 - int(done)) - state_val

            actor_loss = -log_prob * exp_val
            critic_loss = exp_val**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss, self.ac_model.trainable_variables)
        self.ac_model.optimizer.apply_gradients(
            zip(gradient, self.ac_model.trainable_variables))

    def save_model(self):
        self.ac_model.save_weights("path/to/file")

    def load_model(self):
        self.ac_model.load_weights("path/to/file")
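A short training-loop sketch for the TensorFlow ACAgent above. The gym environment, the arguments expected by AgentBase, and the way the action space is passed to ACModel are assumptions made for illustration only:

# Hypothetical training loop (environment and constructor arguments are assumed)
import gym

env = gym.make("CartPole-v1")
agent = ACAgent(learning_rate=1e-3, discount=0.99, action_space=env.action_space.n)

for episode in range(500):
    state, done = env.reset(), False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.learn(state=state, next_state=next_state, reward=reward, done=done)
        state = next_state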
Example #11
def train(model_type, batch_size, sequence_length, frame_shape):
    model = ACModel(model_type, input_shape = (20, 120, 120, 3))
    data = DataSet(sequence_length, frame_shape)

    checkpoint = ModelCheckpoint(filepath = os.path.join('CheckPoints', (model_type + '-.{epoch:03d}-{val_loss:.3f}.hdf5')), verbose = 1, save_best_only = True)
    tensorBoard = TensorBoard(log_dir = os.path.join('CheckPoints', 'logs', model_type))

    if 'parallel' not in model_type:
        tri_generator = data.generator('train', 'fn', batch_size)
        val_generator = data.generator('test', 'fn', batch_size)
    else:
        tri_generator = data.parallel_generator('train', batch_size)
        val_generator = data.parallel_generator('test', batch_size)

    model.model.fit_generator(generator = tri_generator, 
                              steps_per_epoch = data.size('train') // batch_size, 
                              epochs = epochs,
                              verbose = 1,
                              callbacks = [tensorBoard, checkpoint],
                              validation_data = val_generator, 
                              validation_steps = 4, 
                              workers = 1)
Example #12
class Agent:
    """An agent.

    It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""

    def __init__(self, obs_space, action_space, model_dir, device=None, argmax=False, num_envs=1):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
        self.acmodel = ACModel(obs_space, action_space)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()

    def get_actions(self, obss):
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        pass

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
Example #13
    def __init__(self,
                 env,
                 model_dir,
                 model_type='PPO2',
                 logger=None,
                 argmax=False,
                 use_memory=False,
                 use_text=False,
                 num_cpu=1,
                 frames_per_proc=None,
                 discount=0.99,
                 lr=0.001,
                 gae_lambda=0.95,
                 entropy_coef=0.01,
                 value_loss_coef=0.5,
                 max_grad_norm=0.5,
                 recurrence=1,
                 optim_eps=1e-8,
                 optim_alpha=None,
                 clip_eps=0.2,
                 epochs=4,
                 batch_size=256):
        """
        Initialize the Agent object.

        This primarily includes storing of the configuration parameters, but there is some other logic for correctly
        initializing the agent.

        :param env: the environment for training
        :param model_dir: the save directory (appended with the goal_id in initialization)
        :param model_type: the type of model {'PPO2', 'A2C'}
        :param logger: existing text logger
        :param argmax: if we use deterministic or probabilistic action selection
        :param use_memory: if we are using an LSTM
        :param use_text: if we are using NLP to parse the goal
        :param num_cpu: the number of parallel instances for training
        :param frames_per_proc: max time_steps per process (versus constant)
        :param discount: the discount factor (gamma)
        :param lr: the learning rate
        :param gae_lambda: the generalized advantage estimator lambda parameter (training smoothing parameter)
        :param entropy_coef: relative weight for entropy loss
        :param value_loss_coef: relative weight for value function loss
        :param max_grad_norm: max scaling factor for the gradient
        :param recurrence: number of recurrent steps
        :param optim_eps: minimum value to prevent numerical instability
        :param optim_alpha: RMSprop decay parameter (A2C only)
        :param clip_eps: clipping parameter for the advantage and value function (PPO2 only)
        :param epochs: number of epochs in the parameter update (PPO2 only)
        :param batch_size: number of samples for the parameter update (PPO2 only)
        """
        # if the environment has a goal, set the model_dir to the goal folder
        if hasattr(env, 'goal') and env.goal:
            self.model_dir = model_dir + env.goal.goalId + '/'
        else:  # otherwise just use the model_dir as is
            self.model_dir = model_dir

        # store all of the input parameters
        self.model_type = model_type
        self.num_cpu = num_cpu
        self.frames_per_proc = frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.optim_eps = optim_eps
        self.optim_alpha = optim_alpha
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.batch_size = batch_size

        # use the existing logger and create two new ones
        self.txt_logger = logger
        self.csv_file, self.csv_logger = utils.get_csv_logger(self.model_dir)
        self.tb_writer = tensorboardX.SummaryWriter(self.model_dir)

        # set the environment, with some additional checks and init of training_envs
        self.set_env(env)

        self.algo = None  # we don't initialize the algorithm until we call init_training_algo()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.txt_logger.info(f"Device: {device}\n")

        try:  # if we have a saved model, load it
            self.status = utils.get_status(self.model_dir)
        except OSError:  # otherwise initialize the status
            print('Error loading saved model. Initializing empty model...')
            self.status = {"num_frames": 0, "update": 0}
        if self.txt_logger: self.txt_logger.info("Training status loaded\n")

        # get the obs_space and the observation pre-processor
        # (for manipulating gym observations into a torch-friendly format)
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            self.env.observation_space)
        if "vocab" in self.status:
            self.preprocess_obss.vocab.load_vocab(self.status["vocab"])
        if self.txt_logger:
            self.txt_logger.info("Observations preprocessor loaded")

        self.acmodel = ACModel(obs_space,
                               self.env.action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device  # store the device {'cpu', 'cuda:N'}
        self.argmax = argmax  # greedy (True) vs. probabilistic (False) action selection

        if self.acmodel.recurrent:  # initialize the memories
            self.memories = torch.zeros(num_cpu,
                                        self.acmodel.memory_size,
                                        device=self.device)

        if "model_state" in self.status:  # if we have a saved model ('model_state') in the status
            # load that into the initialized model
            self.acmodel.load_state_dict(self.status["model_state"])
        self.acmodel.to(
            device)  # make sure the model is located on the correct device
        self.txt_logger.info("Model loaded\n")
        self.txt_logger.info("{}\n".format(self.acmodel))

        # some redundant code.  uncomment if there are issues and delete after enough testing
        #if 'model_state' in self.status:
        #    self.acmodel.load_state_dict(self.status['model_state'])
        #self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
Example #14
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor

obs_space, preprocess_obss = utils.get_obss_preprocessor(
    envs[0].observation_space)  # TODO
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model

acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo

if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
Example #15
def main():
    # Parse arguments

    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument(
        "--algo",
        required=True,
        help="algorithm to use: a2c | ppo | ppo_intrinsic (REQUIRED)")
    parser.add_argument("--env",
                        required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument(
        "--model",
        default=None,
        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")
    parser.add_argument("--visualize",
                        default=False,
                        help="show real time CNN layer weight changes")

    args = parser.parse_args()

    args.mem = args.recurrence > 1

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments

    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)

    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_alpha,
            args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:

        # Store copies of s_t model params
        old_parameters = {}
        for name, param in acmodel.named_parameters():
            old_parameters[name] = param.detach().cpu().numpy().copy()

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        # Store copies of s_t+1 model params
        new_parameters = {}
        for name, param in acmodel.named_parameters():
            new_parameters[name] = param.detach().cpu().numpy().copy()

        # Compute L2 Norm of model state differences
        # Print model weight change visualization
        for index in range(len(old_parameters.keys())):
            if index == 0 or index == 2 or index == 4:
                key = list(old_parameters.keys())[index]
                old_weights = old_parameters[key]
                new_weights = new_parameters[key]
                norm_diff = numpy.linalg.norm(new_weights - old_weights)
                diff_matrix = abs(new_weights - old_weights)
                diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0],
                                                    norm='max',
                                                    axis=0)
                if print_visual:
                    axs[int(index / 2)].imshow(diff_matrix[:, :, 0, 0],
                                               cmap='Greens',
                                               interpolation='nearest')

        # This allows the plots to update as the model trains
        if print_visual:
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict()
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
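A typical command-line invocation for the training script above, assuming it is saved as train.py; the environment and model names are examples only:

# python train.py --algo ppo --env MiniGrid-DoorKey-5x5-v0 --model DoorKey --save-interval 10 --frames 80000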
Example #16
def main(raw_args=None):

    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo",
                        required=True,
                        help="algorithm to use: a2c | ppo | ipo (REQUIRED)")
    parser.add_argument("--domain1",
                        required=True,
                        help="name of the first domain to train on (REQUIRED)")
    parser.add_argument(
        "--domain2",
        required=True,
        help="name of the second domain to train on (REQUIRED)")
    parser.add_argument(
        "--p1",
        required=True,
        type=float,
        help="Proportion of training environments from first domain (REQUIRED)"
    )
    parser.add_argument("--model", required=True, help="name of the model")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")

    args = parser.parse_args(raw_args)

    args.mem = args.recurrence > 1

    # Check PyTorch version
    if (torch.__version__ != '1.2.0'):
        raise ValueError(
            "PyTorch version must be 1.2.0 (see README). Your version is {}.".
            format(torch.__version__))

    if args.mem:
        raise ValueError("Policies with memory not supported.")

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = args.model

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    torch.backends.cudnn.deterministic = True
    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments from different domains
    domain1 = args.domain1  # e.g., 'MiniGrid-ColoredKeysRed-v0'
    domain2 = args.domain2  # e.g., 'MiniGrid-ColoredKeysYellow-v0'

    p1 = args.p1  # Proportion of environments from domain1

    num_envs_total = args.procs  # Total number of environments
    num_domain1 = math.ceil(
        p1 * num_envs_total)  # Number of environments in domain1
    num_domain2 = num_envs_total - num_domain1  # Number of environments in domain2

    # Environments from domain1
    envs1 = []
    for i in range(num_domain1):
        envs1.append(utils.make_env(domain1, args.seed + 10000 * i))

    # Environments from domain2
    envs2 = []
    for i in range(num_domain2):
        envs2.append(utils.make_env(domain2, args.seed + 10000 * i))

    # All environments
    envs = envs1 + envs2

    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    if args.algo == "ipo":
        # Load model for IPO game
        acmodel = ACModel_average(obs_space, envs[0].action_space, args.mem,
                                  args.text)
        if "model_state" in status:
            acmodel.load_state_dict(status["model_state"])
        acmodel.to(device)
        txt_logger.info("Model loaded\n")
        txt_logger.info("{}\n".format(acmodel))

    else:
        # Load model (for standard PPO or A2C)
        acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
        if "model_state" in status:
            acmodel.load_state_dict(status["model_state"])
        acmodel.to(device)
        txt_logger.info("Model loaded\n")
        txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")

    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)

        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")

    elif args.algo == "ipo":
        # One algo per domain. These have different environments, but a shared acmodel
        algo1 = torch_ac.IPOAlgo(
            envs1, acmodel, 1, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)

        algo2 = torch_ac.IPOAlgo(
            envs2, acmodel, 2, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)

        if "optimizer_state1" in status:
            algo1.optimizer.load_state_dict(status["optimizer_state1"])
            txt_logger.info("Optimizer 1 loaded\n")
        if "optimizer_state2" in status:
            algo2.optimizer.load_state_dict(status["optimizer_state2"])
            txt_logger.info("Optimizer 2 loaded\n")

    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        update_start_time = time.time()

        if args.algo == "ipo":

            # Standard method

            # Collect experiences on first domain
            exps1, logs_exps1 = algo1.collect_experiences()

            # Update params of model corresponding to first domain
            logs_algo1 = algo1.update_parameters(exps1)

            # Collect experiences on second domain
            exps2, logs_exps2 = algo2.collect_experiences()

            # Update params of model corresponding to second domain
            logs_algo2 = algo2.update_parameters(exps2)

            # Update end time
            update_end_time = time.time()

            # Combine logs
            logs_exps = {
                'return_per_episode':
                logs_exps1["return_per_episode"] +
                logs_exps2["return_per_episode"],
                'reshaped_return_per_episode':
                logs_exps1["reshaped_return_per_episode"] +
                logs_exps2["reshaped_return_per_episode"],
                'num_frames_per_episode':
                logs_exps1["num_frames_per_episode"] +
                logs_exps2["num_frames_per_episode"],
                'num_frames':
                logs_exps1["num_frames"] + logs_exps2["num_frames"]
            }

            logs_algo = {
                'entropy':
                (num_domain1 * logs_algo1["entropy"] +
                 num_domain2 * logs_algo2["entropy"]) / num_envs_total,
                'value': (num_domain1 * logs_algo1["value"] +
                          num_domain2 * logs_algo2["value"]) / num_envs_total,
                'policy_loss':
                (num_domain1 * logs_algo1["policy_loss"] +
                 num_domain2 * logs_algo2["policy_loss"]) / num_envs_total,
                'value_loss':
                (num_domain1 * logs_algo1["value_loss"] +
                 num_domain2 * logs_algo2["value_loss"]) / num_envs_total,
                'grad_norm':
                (num_domain1 * logs_algo1["grad_norm"] +
                 num_domain2 * logs_algo2["grad_norm"]) / num_envs_total
            }

            logs = {**logs_exps, **logs_algo}
            num_frames += logs["num_frames"]

        else:
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()
            num_frames += logs["num_frames"]

        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # header += ["debug_last_env_reward"]
            # data += [logs["debug_last_env_reward"]]

            header += ["total_loss"]
            data += [
                logs["policy_loss"] - args.entropy_coef * logs["entropy"] +
                args.value_loss_coef * logs["value_loss"]
            ]

            if status["num_frames"] == 0:
                csv_logger.writerow(header)

            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:

            if args.algo == "ipo":
                status = {
                    "num_frames": num_frames,
                    "update": update,
                    "model_state": acmodel.state_dict(),
                    "optimizer_state1": algo1.optimizer.state_dict(),
                    "optimizer_state2": algo2.optimizer.state_dict()
                }
            else:
                status = {
                    "num_frames": num_frames,
                    "update": update,
                    "model_state": acmodel.state_dict(),
                    "optimizer_state": algo.optimizer.state_dict()
                }

            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
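Because this variant of main accepts raw_args, it can also be invoked programmatically, which is convenient for sweeps over domain mixtures. A hedged sketch; the domain names come from the comments above and the remaining values are placeholders:

# Hypothetical programmatic invocation of main(raw_args)
if __name__ == "__main__":
    main([
        "--algo", "ppo",
        "--domain1", "MiniGrid-ColoredKeysRed-v0",
        "--domain2", "MiniGrid-ColoredKeysYellow-v0",
        "--p1", "0.5",
        "--model", "mixed_domains_ppo",
        "--frames", "100000",
    ])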
Example #17
class Agent:
    """An agent.

    It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""
    def __init__(self,
                 env,
                 obs_space,
                 action_space,
                 model_dir,
                 ignoreLTL,
                 progression_mode,
                 gnn,
                 recurrence=1,
                 dumb_ac=False,
                 device=None,
                 argmax=False,
                 num_envs=1):
        try:
            print(model_dir)
            status = utils.get_status(model_dir)
        except OSError:
            status = {"num_frames": 0, "update": 0}

        using_gnn = (gnn != "GRU" and gnn != "LSTM")
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            env, using_gnn, progression_mode)
        if "vocab" in status and self.preprocess_obss.vocab is not None:
            self.preprocess_obss.vocab.load_vocab(status["vocab"])

        if recurrence > 1:
            self.acmodel = RecurrentACModel(env, obs_space, action_space,
                                            ignoreLTL, gnn, dumb_ac, True)
            self.memories = torch.zeros(num_envs,
                                        self.acmodel.memory_size,
                                        device=device)
        else:
            self.acmodel = ACModel(env, obs_space, action_space, ignoreLTL,
                                   gnn, dumb_ac, True)

        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()

    def get_actions(self, obss):
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:
                dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(dones, dtype=torch.float, device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
Example #18
    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(envs[0].observation_space, envs[0].action_space, memory, False)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if algorithm == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, 5, discount, lr, gae_lambda,
                                entropy_coef, value_loss_coef, max_grad_norm, recurrence,
                                optim_alpha, optim_eps, preprocess_obss)
    elif algorithm == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, 128, discount, lr, gae_lambda,
                                entropy_coef, value_loss_coef, max_grad_norm, recurrence,
Example #19
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model

try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    acmodel = ACModel(obs_space,
                      envs[0].action_space,
                      args.model_type,
                      use_bottleneck=args.use_bottleneck,
                      dropout=args.use_dropout,
                      use_l2a=args.use_l2a,
                      use_bn=args.use_bn,
                      sni_type=args.sni_type)

    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo

# a2c does not yet support the bottleneck
assert args.algo == "ppo"
Example #20
try:
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor

obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model

acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text, args.use_rim)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo

if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
Example #21
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model

try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    acmodel = ACModel(obs_space,
                      envs[0].action_space,
                      args.model_type,
                      use_bottleneck=args.use_bottleneck,
                      dropout=args.use_dropout,
                      use_l2a=args.use_l2a,
                      use_bn=args.use_bn,
                      sni_type=args.sni_type,
                      flow=args.flow,
                      n_flows=args.n_flows,
                      num_latent_channels=args.num_latent_channels)

    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo
Example #22
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model

try:
    base_model = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.algo == "dqn":
        base_model = DQNModel(obs_space, envs[0].action_space, args.mem,
                              args.text)
    else:
        base_model = ACModel(obs_space, envs[0].action_space, args.mem,
                             args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(base_model))

if torch.cuda.is_available():
    base_model.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Train model

num_frames = status["num_frames"]
total_start_time = time.time()
update = status["update"]
best_val = 0

if args.algo == "a2c":
Example #23
    env = gym.make(args.env)
    env.seed(args.seed + 10000 * i)
    envs.append(env)

# Define obss preprocessor

preprocess_obss = utils.ObssPreprocessor(save_dir, envs[0].observation_space)

# Define actor-critic model

if utils.model_exists(save_dir):
    acmodel = utils.load_model(save_dir)
    status = utils.load_status(save_dir)
    logger.info("Model successfully loaded\n")
else:
    acmodel = ACModel(preprocess_obss.obs_space, envs[0].action_space,
                      not args.no_instr, not args.no_mem)
    status = {"num_frames": 0, "update": 0}
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo

if args.algo == "a2c":
    algo = torch_rl.A2CAlgo(envs, acmodel, args.frames_per_proc, args.discount,
                            args.lr, args.gae_lambda, args.entropy_coef,
                            args.value_loss_coef, args.max_grad_norm,
                            args.recurrence, args.optim_alpha, args.optim_eps,
Example #24
obs_space, preprocess_obss = utils.get_obss_preprocessor(args.env, envs[0].observation_space, model_dir)

# Load training status

try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model

try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo

if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
Example #25
class Agent:
    def __init__(self,
                 env,
                 model_dir,
                 model_type='PPO2',
                 logger=None,
                 argmax=False,
                 use_memory=False,
                 use_text=False,
                 num_cpu=1,
                 frames_per_proc=None,
                 discount=0.99,
                 lr=0.001,
                 gae_lambda=0.95,
                 entropy_coef=0.01,
                 value_loss_coef=0.5,
                 max_grad_norm=0.5,
                 recurrence=1,
                 optim_eps=1e-8,
                 optim_alpha=None,
                 clip_eps=0.2,
                 epochs=4,
                 batch_size=256):
        """
        Initialize the Agent object.

        This primarily includes storing of the configuration parameters, but there is some other logic for correctly
        initializing the agent.

        :param env: the environment for training
        :param model_dir: the save directory (appended with the goal_id in initialization)
        :param model_type: the type of model {'PPO2', 'A2C'}
        :param logger: existing text logger
        :param argmax: if we use deterministic or probabilistic action selection
        :param use_memory: if we are using an LSTM
        :param use_text: if we are using NLP to parse the goal
        :param num_cpu: the number of parallel instances for training
        :param frames_per_proc: max time_steps per process (versus constant)
        :param discount: the discount factor (gamma)
        :param lr: the learning rate
        :param gae_lambda: the generalized advantage estimator lambda parameter (training smoothing parameter)
        :param entropy_coef: relative weight for entropy loss
        :param value_loss_coef: relative weight for value function loss
        :param max_grad_norm: max scaling factor for the gradient
        :param recurrence: number of recurrent steps
        :param optim_eps: epsilon added to the optimizer denominator for numerical stability
        :param optim_alpha: RMSprop decay parameter (A2C only)
        :param clip_eps: clipping parameter for the policy probability ratio and the value function (PPO2 only)
        :param epochs: number of epochs in the parameter update (PPO2 only)
        :param batch_size: number of samples for the parameter update (PPO2 only)
        """
        # if the environment has a goal, save the model under that goal's folder
        if hasattr(env, 'goal') and env.goal:
            self.model_dir = model_dir + env.goal.goalId + '/'
        else:  # otherwise just use the model_dir as is
            self.model_dir = model_dir

        # store all of the input parameters
        self.model_type = model_type
        self.num_cpu = num_cpu
        self.frames_per_proc = frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.optim_eps = optim_eps
        self.optim_alpha = optim_alpha
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.batch_size = batch_size

        # use the existing logger and create two new ones
        self.txt_logger = logger
        self.csv_file, self.csv_logger = utils.get_csv_logger(self.model_dir)
        self.tb_writer = tensorboardX.SummaryWriter(self.model_dir)

        # set the environment, with some additional checks and init of training_envs
        self.set_env(env)

        self.algo = None  # we don't initialize the algorithm until we call init_training_algo()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.txt_logger.info(f"Device: {device}\n")

        try:  # if we have a saved model, load it
            self.status = utils.get_status(self.model_dir)
        except OSError:  # otherwise initialize the status
            print('error loading saved model.  initializing empty model...')
            self.status = {"num_frames": 0, "update": 0}
        if self.txt_logger: self.txt_logger.info("Training status loaded\n")

        if "vocab" in self.status:
            preprocess_obss.vocab.load_vocab(self.status["vocab"])
        if self.txt_logger:
            self.txt_logger.info("Observations preprocessor loaded")

        # get the obs_space and the observation pre-processor
        # (for manipulating gym observations into a torch-friendly format)
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            self.env.observation_space)
        self.acmodel = ACModel(obs_space,
                               self.env.action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device  # store the device {'cpu', 'cuda:N'}
        self.argmax = argmax  # greedy (True) or probabilistic (False) action selection

        if self.acmodel.recurrent:  # initialize the memories
            self.memories = torch.zeros(num_cpu,
                                        self.acmodel.memory_size,
                                        device=self.device)

        if "model_state" in self.status:  # if we have a saved model ('model_state') in the status
            # load that into the initialized model
            self.acmodel.load_state_dict(self.status["model_state"])
        self.acmodel.to(device)  # make sure the model is located on the correct device
        self.txt_logger.info("Model loaded\n")
        self.txt_logger.info("{}\n".format(self.acmodel))

        # some redundant code.  uncomment if there are issues and delete after enough testing
        #if 'model_state' in self.status:
        #    self.acmodel.load_state_dict(self.status['model_state'])
        #self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))

    def init_training_algo(self, num_envs=None):
        """
        Initialize the training algorithm.

        This primarily calls the object creation functions for the A2C or PPO2 and the optimizer, but this also spawns
        a number of parallel environments, based on the self.num_cpu or num_envs input (if provided).

        Note, the spawning of parallel environments is VERY slow due to deepcopying the termination sets.  I tried some
        workarounds, but nothing worked properly, so we are stuck with it for now.

        :param num_envs: an override for the default number of environments to spawn (in self.num_cpu)
        """
        if not num_envs:
            num_envs = self.num_cpu

        if self.model_type == "A2C":
            # check to make sure that the A2C parameters are set
            assert self.optim_alpha
            self.training_envs = [deepcopy(self.env) for _ in range(num_envs)]  # spawn parallel environments

            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)

            self.algo = torch_ac.A2CAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
                self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
                self.recurrence, self.optim_alpha, self.optim_eps,
                self.preprocess_obss)
        elif self.model_type == "PPO2":
            # check to see if the PPO2 parameters are set
            assert self.clip_eps and self.epochs and self.batch_size
            self.training_envs = [deepcopy(self.env) for _ in range(num_envs)]  # spawn parallel environments

            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)

            self.algo = torch_ac.PPOAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
                self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
                self.recurrence, self.optim_eps, self.clip_eps, self.epochs,
                self.batch_size, self.preprocess_obss)
        else:
            raise ValueError("Incorrect algorithm name: {}".format(self.model_type))

        # load the optimizer state, if it exists
        if "optimizer_state" in self.status:
            self.algo.optimizer.load_state_dict(self.status["optimizer_state"])
        self.txt_logger.info("Optimizer loaded\n")

    def learn(self,
              total_timesteps,
              log_interval=1,
              save_interval=10,
              save_env_info=False,
              save_loc=None):
        """
        The primary training loop.

        :param total_timesteps: the total number of timesteps
        :param log_interval: the period between logging/printing updates
        :param save_interval: the number of updates between model saving
        :param save_env_info: if we save the environment info (termination set) VERY SLOW
        :param save_loc: override for the save location (currently ignored)
        :return: True, if training is successful
        """
        # initialize the training algo / environment list / optimizer
        self.init_training_algo()

        if save_loc:
            print(
                'ignoring save_loc override.  if this is not intended, fix me')

        # initialize parameters
        self.num_frames = self.status["num_frames"]
        self.update = self.status["update"]
        start_time = time.time()

        # loop until we reach the desired number of timesteps
        while self.num_frames < total_timesteps:
            # Update model parameters

            update_start_time = time.time()  # store the time (for fps calculations)
            exps, logs1 = self.algo.collect_experiences()  # collect a batch of experience for training
            logs2 = self.algo.update_parameters(exps)  # update the parameters based on the experiences
            logs = {**logs1, **logs2}  # merge the logs for printing
            update_end_time = time.time()

            self.num_frames += logs["num_frames"]
            self.update += 1

            # all of this messy stuff is just storing and printing the log info

            if self.update % log_interval == 0:
                fps = logs["num_frames"] / (update_end_time -
                                            update_start_time)
                duration = int(time.time() - start_time)
                return_per_episode = utils.synthesize(
                    logs["return_per_episode"])
                rreturn_per_episode = utils.synthesize(
                    logs["reshaped_return_per_episode"])
                num_frames_per_episode = utils.synthesize(
                    logs["num_frames_per_episode"])

                header = ["update", "frames", "FPS", "duration"]
                data = [self.update, self.num_frames, fps, duration]
                header += [
                    "rreturn_" + key for key in rreturn_per_episode.keys()
                ]
                data += rreturn_per_episode.values()
                header += [
                    "num_frames_" + key
                    for key in num_frames_per_episode.keys()
                ]
                data += num_frames_per_episode.values()
                header += [
                    "entropy", "value", "policy_loss", "value_loss",
                    "grad_norm"
                ]
                data += [
                    logs["entropy"], logs["value"], logs["policy_loss"],
                    logs["value_loss"], logs["grad_norm"]
                ]

                self.txt_logger.info(
                    "U {} | F {:06} | FPS {:04.0f} | D {} | rR:usmM {:.2f} {:.2f} {:.2f} {:.2f} | F:usmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | D {:.3f}"
                    .format(*data))

                header += [
                    "return_" + key for key in return_per_episode.keys()
                ]
                data += return_per_episode.values()

                if self.status["num_frames"] == 0:
                    self.csv_logger.writerow(header)
                self.csv_logger.writerow(data)
                self.csv_file.flush()

                for field, value in zip(header, data):
                    self.tb_writer.add_scalar(field, value, self.num_frames)

            # Save status

            if save_interval > 0 and self.update % save_interval == 0:
                self._save_training_info()
                if save_env_info:
                    for e in self.training_envs:
                        if hasattr(e, 'save_env_info'): e.save_env_info()

        self._clear_training_envs()

        return True

    def _save_training_info(self):
        """
        Function to save the training info.
        """

        # update the status dictionary
        self.status = {
            "num_frames": self.num_frames,
            "update": self.update,
            "model_state": self.acmodel.state_dict(),
            "optimizer_state": self.algo.optimizer.state_dict()
        }

        if hasattr(self.preprocess_obss, "vocab"):  # if we are using NLP, save the vocab info
            self.status["vocab"] = self.preprocess_obss.vocab.vocab

        utils.save_status(self.status,
                          self.model_dir)  # save the status info to model_dir
        self.txt_logger.info("Status saved")

    def _clear_training_envs(self):
        """
        Clear the training environments to free up memory.
        """

        # the termination set gets lost, so we need to store it again
        if hasattr(self.env, 'termination_set'):
            self.env.termination_set = [
                s for e in self.training_envs for s in e.termination_set
            ]

        # clear the env and the training envs
        self.algo.env = None
        self.training_envs = None

    def save(self, f):
        """
        Legacy function for saving the model.

        TODO: place the saving logic for the model here
        :param f: target file/path for the saved model (currently unused)
        """
        print('self.save() - currently not implemented')

    def set_env(self, env):
        """
        Set the environment and clear the training environments

        :param env: environment for training/acting
        """
        # check to make sure the environment is the correct type
        assert isinstance(env, gym.Env)
        self.env = env
        self.training_envs = None

    def predict(self, obs, state=None, deterministic=False):
        """
        Wrapper for training code compatibility.  Calls get_action() to predict the action to take based on the
        current observation.

        :param obs: observation for predicting the action
        :param state: state of the LSTM (unused)
        :param deterministic: whether to use deterministic or probabilistic actions (unused)
        :return: action and LSTM state
        """
        # assert (state==None) and (deterministic==False) # still need to reimplement
        # return (action, states) - states is unused at the moment
        return self.get_action(obs), None

    def get_actions(self, obss):
        """
        Get a list of actions for a list of observations.

        :param obss: list of observations for predicting actions
        :return: list of actions for the associated observations
        """
        # preprocess the observations to put them in a torch-friendly format
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        # no gradient computation is needed for an inference-only forward pass
        with torch.no_grad():
            if self.acmodel.recurrent:  # if we are using a recurrent model
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:  # otherwise do a plain feed-forward pass
                dist, _ = self.acmodel(preprocessed_obss)

        # the acmodel returns a probability distribution
        if self.argmax:  # if we are deterministic, take the action with the highest probability
            actions = dist.probs.max(1, keepdim=True)[1]
        else:  # otherwise sample the distribution to select the action
            actions = dist.sample()

        return actions.cpu().numpy()  # return a numpy array, not a tensor

    def get_action(self, obs):
        """
        Wrapper for get_actions() to produce just a single action (rather than a list of actions) for acting.

        :param obs: single observation
        :return: single action
        """
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        """
        rl-starter-files code.  Not sure what this does.

        :param rewards:
        :param dones:
        """
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(
                dones, dtype=torch.float, device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        """
        rl-starter-files code.  Not sure what this does (other than wrap analyze_feedbacks().

        :param reward:
        :param done:
        :return:
        """
        return self.analyze_feedbacks([reward], [done])
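
Below is a minimal usage sketch (not part of the original example) showing how the Agent class above might be driven end to end. It assumes a MiniGrid-style gym environment and the same `utils` helpers used by the class; the environment id, storage directory, logger helper, and hyperparameters are placeholders chosen for illustration.

import gym
import utils

# hypothetical environment id and storage path
env = gym.make("MiniGrid-Empty-5x5-v0")
model_dir = "storage/demo/"
txt_logger = utils.get_txt_logger(model_dir)  # assumes this helper exists in utils

agent = Agent(env,
              model_dir=model_dir,
              model_type="PPO2",
              logger=txt_logger,
              num_cpu=4,
              frames_per_proc=128)

# train for a small number of frames, then query an action on a fresh episode
agent.learn(total_timesteps=20000, log_interval=10, save_interval=50)

obs = env.reset()
action = agent.get_action(obs)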
Example #26
class Agent:
    """An agent.

    It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""
    def __init__(self,
                 env,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False):
        obs_space, self.preprocess_obs_goals = utils.get_obs_goals_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space,
                               action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        status = utils.get_status(model_dir)

        self.goals = list(status['agent_goals'].values())
        # for goal in self.goals:
        #     goal = env.unwrapped.get_obs_render( goal, tile_size=32)
        #     plt.imshow(goal)
        #     plt.show()

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size,
                                        device=self.device)

        self.acmodel.load_state_dict(status["model_state"])
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obs_goals, "vocab"):
            self.preprocess_obs_goals.vocab.load_vocab(status["vocab"])

    def concat_obs_goal(self, obs):
        if 'image' in obs:
            obs_goals = [{
                "image":
                np.concatenate((obs["image"], self.goals[i]), axis=2),
                "mission":
                obs['mission']
            } for i in range(len(self.goals))]
        else:
            obs_goals = [
                np.concatenate((obs, self.goals[i]), axis=2)
                for i in range(len(self.goals))
            ]
        return obs_goals

    def get_actions(self, obss):
        actions = np.zeros(len(obss), dtype=int)

        for i in range(len(obss)):
            memory = self.memories[i]

            obs_goals = self.concat_obs_goal(obss[i])
            preprocessed_obs_goals = self.preprocess_obs_goals(
                obs_goals, device=self.device)

            with torch.no_grad():
                if self.acmodel.recurrent:
                    memory = torch.stack([memory] * len(self.goals), 0)
                    dists, values, memory = self.acmodel(
                        preprocessed_obs_goals, memory)
                else:
                    dists, values = self.acmodel(preprocessed_obs_goals)
            g = values.data.max(0)[1]
            print(values.data, g)
            if self.argmax:
                actions[i] = dists.probs.max(1,
                                             keepdim=True)[1][g].cpu().numpy()
            else:
                actions[i] = dists.sample()[g].cpu().numpy()

            if self.acmodel.recurrent:
                self.memories[i] = memory[g]

        return actions

    def reset(self):
        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size,
                                        device=self.device)

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(
                dones, dtype=torch.float, device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
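
The concat_obs_goal() trick above conditions a single policy on multiple candidate goals: each goal image is stacked onto the current observation along the channel axis, the model scores every (observation, goal) pair, and the goal with the highest value estimate is followed. A small illustrative sketch of just the concatenation step (not from the original source; the 7x7x3 MiniGrid-style shapes are assumptions):

import numpy as np

obs_image = np.zeros((7, 7, 3), dtype=np.uint8)        # current observation image
goal_images = [np.full((7, 7, 3), k, dtype=np.uint8)   # two hypothetical goal images
               for k in (1, 2)]

# one stacked input per candidate goal, as concat_obs_goal() builds them
obs_goals = [np.concatenate((obs_image, g), axis=2) for g in goal_images]
print(obs_goals[0].shape)  # (7, 7, 6): observation channels followed by goal channels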
Example #27
    except:
        txt_logger.info("Failed to load pretrained model.\n")
        exit(1)

# Load observations preprocessor
using_gnn = (args.gnn != "GRU" and args.gnn != "LSTM")
obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0], using_gnn, progression_mode)
if "vocab" in status and preprocess_obss.vocab is not None:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded.\n")

# Load model
if use_mem:
    acmodel = RecurrentACModel(envs[0].env, obs_space, envs[0].action_space, args.ignoreLTL, args.gnn, args.dumb_ac, args.freeze_ltl)
else:
    acmodel = ACModel(envs[0].env, obs_space, envs[0].action_space, args.ignoreLTL, args.gnn, args.dumb_ac, args.freeze_ltl)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
    txt_logger.info("Loading model from existing run.\n")

elif args.pretrained_gnn:
    acmodel.load_pretrained_gnn(pretrained_status["model_state"])
    txt_logger.info("Pretrained model loaded.\n")

acmodel.to(device)
txt_logger.info("Model loaded.\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
Example #28
# Load observations preprocessor

obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)

if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model

if args.model == "ACMLP":
    acmodel = ACMLPModel(obs_space, envs[0].action_space)
elif args.model == "ACNAC":
    acmodel = ACNACModel(obs_space, envs[0].action_space)
else:
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text, args.use_nac)

if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
acmodel.eval()

txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
Example #29
        else:
            status = utils.get_status(model_dir)
            txt_logger.info("Training status loaded\n")
    except OSError:
        status = {"num_frames": 0, "update": 0}

    # Load observations preprocessor

    obs_space, preprocess_obs_goals = utils.get_obs_goals_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obs_goals.vocab.load_vocab(status["vocab"])
    txt_logger.info("observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
        txt_logger.info("Model loaded\n")
    acmodel.to(device)
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = a2c.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obs_goals)
    elif args.algo == "ppo":
        algo = ppo.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
Example #30
# Load training status

try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model

try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.model_type == 'standard':
        acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text,
                          args.prev_action, args.manual_memory,
                          args.manual_memory_size)
    elif args.model_type == 'aux':
        acmodel = ACAuxModel(obs_space, envs[0].action_space, args.mem,
                             args.text, args.prev_action, args.manual_memory,
                             args.manual_memory_size, args.aux_context)
    elif args.model_type == 'aux_empower':
        acmodel = ACAuxEmpowerModel(obs_space, envs[0].action_space, args.mem,
                                    args.text, args.prev_action,
                                    args.manual_memory,
                                    args.manual_memory_size, args.aux_context)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()