Example #1
    def __init__(self, params, experience_replay_buffer, metrics, results_dir, env):
        self.parms = params     
        self.D = experience_replay_buffer  
        self.metrics = metrics
        self.env = env
        self.tested_episodes = 0

        self.statistics_path = results_dir+'/statistics' 
        self.model_path = results_dir+'/model' 
        self.video_path = results_dir+'/video' 
        self.rew_vs_pred_rew_path = results_dir+'/rew_vs_pred_rew'
        self.dump_plan_path = results_dir+'/dump_plan'
        
        # Create the output folders if they do not exist
        os.makedirs(self.statistics_path, exist_ok=True) 
        os.makedirs(self.model_path, exist_ok=True) 
        os.makedirs(self.video_path, exist_ok=True) 
        os.makedirs(self.rew_vs_pred_rew_path, exist_ok=True) 
        os.makedirs(self.dump_plan_path, exist_ok=True) 
        

        # Create models
        self.transition_model = TransitionModel(self.parms.belief_size, self.parms.state_size, self.env.action_size,
                                                self.parms.hidden_size, self.parms.embedding_size,
                                                self.parms.activation_function).to(device=self.parms.device)
        self.observation_model = ObservationModel(self.parms.belief_size, self.parms.state_size,
                                                  self.parms.embedding_size,
                                                  self.parms.activation_function).to(device=self.parms.device)
        self.reward_model = RewardModel(self.parms.belief_size, self.parms.state_size, self.parms.hidden_size,
                                        self.parms.activation_function).to(device=self.parms.device)
        self.encoder = Encoder(self.parms.embedding_size, self.parms.activation_function).to(device=self.parms.device)
        self.param_list = (list(self.transition_model.parameters()) + list(self.observation_model.parameters()) +
                           list(self.reward_model.parameters()) + list(self.encoder.parameters()))
        self.optimiser = optim.Adam(self.param_list,
                                    lr=0 if self.parms.learning_rate_schedule != 0 else self.parms.learning_rate,
                                    eps=self.parms.adam_epsilon)
        self.planner = MPCPlanner(self.env.action_size, self.parms.planning_horizon, self.parms.optimisation_iters,
                                  self.parms.candidates, self.parms.top_candidates, self.transition_model,
                                  self.reward_model, self.env.action_range[0], self.env.action_range[1])

        self.global_prior = Normal(torch.zeros(self.parms.batch_size, self.parms.state_size, device=self.parms.device),
                                   torch.ones(self.parms.batch_size, self.parms.state_size, device=self.parms.device))  # Global prior N(0, I)
        self.free_nats = torch.full((1, ), self.parms.free_nats, dtype=torch.float32, device=self.parms.device)  # Allowed deviation in KL divergence
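The `self.free_nats` tensor above is normally consumed when fitting the model, where it clamps the KL term between the posterior and prior state distributions. A minimal sketch of that clamping, assuming PlaNet-style losses; the helper name and the (time, batch, state) tensor layout are assumptions, not part of the snippet:

import torch
from torch.distributions import Normal
from torch.distributions.kl import kl_divergence

def clamped_kl_loss(prior_mean, prior_std, posterior_mean, posterior_std, free_nats):
    # Sum the per-dimension KL over the state dimension (assumed to be dim 2),
    # then clamp it from below at free_nats so small divergences are not penalised.
    kl = kl_divergence(Normal(posterior_mean, posterior_std),
                       Normal(prior_mean, prior_std)).sum(dim=2)
    return torch.max(kl, free_nats).mean()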
Example #2
def setup_planner(args: argparse.Namespace, env: Env, transition_model: nn.Module, reward_model: nn.Module) -> nn.Module:
    planner = MPCPlanner(
        env.action_size,
        args.planning_horizon,
        args.optimisation_iters,
        args.candidates,
        args.top_candidates,
        transition_model,
        reward_model,
        env.action_range[0],
        env.action_range[1]
    )
    return planner
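A usage sketch; `args`, `env`, and the two models are assumed to exist as in the surrounding examples, and the second line assumes the planner is invoked with the current belief and latent state, as in PlaNet-style code:

planner = setup_planner(args, env, transition_model, reward_model)
action = planner(belief, posterior_state)  # assumed call signature: plan an action from the current latent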
Example #3
    transition_model.load_state_dict(model_dicts['transition_model'])
    observation_model.load_state_dict(model_dicts['observation_model'])
    reward_model.load_state_dict(model_dicts['reward_model'])
    encoder.load_state_dict(model_dicts['encoder'])
    optimiser.load_state_dict(model_dicts['optimiser'])

mode = "continuous"
num_actions = -1
if isinstance(env._env.action_space, gym.spaces.Discrete):
    mode = "discrete"
    num_actions = env._env.action_space.n
planner = MPCPlanner(env.action_size,
                     args.planning_horizon,
                     args.optimisation_iters,
                     args.candidates,
                     args.top_candidates,
                     transition_model,
                     reward_model,
                     mode=mode,
                     num_actions=num_actions)
global_prior = Normal(
    torch.zeros(args.batch_size, args.state_size, device=args.device),
    torch.ones(args.batch_size, args.state_size,
               device=args.device))  # Global prior N(0, I)
free_nats = torch.full(
    (1, ), args.free_nats,
    device=args.device)  # Allowed deviation in KL divergence


def update_belief_and_act(args, env, planner, transition_model, encoder,
                          belief, posterior_state, action, observation, test):
Example #4
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available() and not args.disable_cuda:
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(args.seed)
else:
    args.device = torch.device('cpu')
os.makedirs('results', exist_ok=True)
os.makedirs('checkpoints', exist_ok=True)
# Initialise environment, experience replay memory and planner
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length,
          args.action_repeat)
D = ExperienceReplay(args.experience_size, args.symbolic_env,
                     env.observation_size, env.action_size, args.device)
planner = MPCPlanner(env.action_size, args.planning_horizon,
                     args.optimisation_iters, args.candidates,
                     args.top_candidates)

# Initialise dataset D with S random seed episodes
for s in range(args.seed_episodes):
    observation, done = env.reset(), False
    while not done:
        action = env.sample_random_action()
        next_observation, reward, done = env.step(action)
        D.append(observation, action, reward, done)
        observation = next_observation

# Initialise model parameters randomly
transition_model = TransitionModel(args.belief_size, args.state_size,
                                   env.action_size, args.hidden_size,
                                   args.embedding_size).to(device=args.device)
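The snippet ends mid-setup; a sketch of how the remaining models could be constructed in the same style. The argument lists mirror the `TransitionModel` call above and Example #1 (minus the activation argument), so treat them as assumptions about this repo's constructors:

observation_model = ObservationModel(args.belief_size, args.state_size,
                                     args.embedding_size).to(device=args.device)
reward_model = RewardModel(args.belief_size, args.state_size,
                           args.hidden_size).to(device=args.device)
encoder = Encoder(args.embedding_size).to(device=args.device)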
Example #5
param_list = list(transition_model.parameters()) + list(
    observation_model.parameters()) + list(reward_model.parameters()) + list(
        encoder.parameters())
optimiser = optim.Adam(
    param_list,
    lr=0 if args.learning_rate_schedule != 0 else args.learning_rate,
    eps=args.adam_epsilon)
if args.models != '' and os.path.exists(args.models):
    model_dicts = torch.load(args.models)
    transition_model.load_state_dict(model_dicts['transition_model'])
    observation_model.load_state_dict(model_dicts['observation_model'])
    reward_model.load_state_dict(model_dicts['reward_model'])
    encoder.load_state_dict(model_dicts['encoder'])
    optimiser.load_state_dict(model_dicts['optimiser'])
planner = MPCPlanner(env.action_size, args.planning_horizon,
                     args.optimisation_iters, args.candidates,
                     args.top_candidates, transition_model, reward_model,
                     env.action_range[0], env.action_range[1])
global_prior = Normal(
    torch.zeros(args.batch_size, args.state_size, device=args.device),
    torch.ones(args.batch_size, args.state_size,
               device=args.device))  # Global prior N(0, I)
free_nats = torch.full(
    (1, ), args.free_nats, dtype=torch.float32,
    device=args.device)  # Allowed deviation in KL divergence


def update_belief_and_act(args,
                          env,
                          planner,
                          transition_model,
                          encoder,
Example #6
for s in range(cfg['seed_episodes']):
    observation = env.reset()
    done = False
    while not done:
        next_observation, action, reward, done = env.step()
        replay.append(observation, action, reward, done)
        observation = next_observation

# Init PlaNet
transition_model = Transition(cfg)
observation_model = Observation(cfg)
reward_model = Reward(cfg)
encoder = Encoder(cfg)

optim = tf.train.AdamOptimizer(cfg['learning_rate'], epsilon=cfg['optim_eps'])
planner = MPCPlanner(cfg, env.action_size, transition_model, reward_model)
global_prior = tf.distributions.Normal(
    tf.zeros([cfg['batch_size'], cfg['state_size']]),
    tf.ones([cfg['batch_size'], cfg['state_size']]))  # Global prior N(0, I)
free_nats = tf.fill(dims=[1], value=cfg['free_nats'])  # Allowed deviation in KL divergence

# Training
for episode in trange(cfg['train']['episodes']):
    # Model fitting
    losses = []
    for _ in trange(cfg['collect_interval']):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)}
        obs, actions, rewards, nonterminals = replay.sample()
        # Create initial belief and state for time t = 0
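The chunk cuts off at the comment above; a minimal sketch of that initialisation, assuming the belief and posterior state simply start as zero tensors of the configured sizes (the `belief_size` config key is an assumption):

        init_belief = tf.zeros([cfg['batch_size'], cfg['belief_size']])
        init_state = tf.zeros([cfg['batch_size'], cfg['state_size']])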