Example #1
import json
import os
import os.path as osp

import torch

# NOTE: logger, Env, update_env_kwargs and the per-algorithm DEFAULT_PARAMS are
# assumed to be provided by the surrounding project; they are not standard-library modules.


def run_task(arg_vv, log_dir, exp_name):
    if arg_vv['algorithm'] == 'planet':
        from planet.config import DEFAULT_PARAMS
    elif arg_vv['algorithm'] == 'dreamer':
        from dreamer.config import DEFAULT_PARAMS
    else:
        raise NotImplementedError

    # Merge the caller-supplied variant into a copy of the defaults so the
    # module-level DEFAULT_PARAMS dict is not mutated across runs.
    vv = dict(DEFAULT_PARAMS)
    vv.update(**arg_vv)
    vv = update_env_kwargs(vv)
    vv['max_episode_length'] = vv['env_kwargs']['horizon']

    # Configure logger
    logger.configure(dir=log_dir, exp_name=exp_name)
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Configure torch
    if torch.cuda.is_available():
        device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
        torch.cuda.manual_seed(vv['seed'])
    else:
        device = torch.device('cpu')

    # Dump parameters
    with open(osp.join(logdir, 'variant.json'), 'w') as f:
        json.dump(vv, f, indent=2, sort_keys=True)
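    # Create the environment; seed, episode length, action repeat, bit depth and
    # image size all come from the merged variant dict.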
    env = Env(vv['env_name'],
              vv['symbolic_env'],
              vv['seed'],
              vv['max_episode_length'],
              vv['action_repeat'],
              vv['bit_depth'],
              vv['image_dim'],
              env_kwargs=vv['env_kwargs'])

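    # Instantiate and train the selected agent; the imports are kept inside the
    # branches so that only the chosen algorithm's dependencies are loaded.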
    if vv['algorithm'] == 'planet':
        from planet.planet_agent import PlaNetAgent
        agent = PlaNetAgent(env, vv, device)
        agent.train(train_epoch=vv['train_epoch'])
        env.close()
    elif vv['algorithm'] == 'dreamer':
        from dreamer.dreamer_agent import DreamerAgent
        agent = DreamerAgent(env, vv, device)
        agent.train(train_episode=vv['train_episode'])
        env.close()
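
For context, here is a minimal, hypothetical sketch of how run_task might be called. The key names mirror the ones the function reads above, but the concrete values, the log directory, and the assumption that DEFAULT_PARAMS supplies the remaining keys (env_name, symbolic_env, action_repeat, bit_depth, image_dim, ...) are illustrative only.

# Hypothetical driver; all values below are placeholders, and the exact shape
# of the env-related keys depends on what update_env_kwargs expects.
variant = {
    'algorithm': 'planet',           # or 'dreamer'
    'seed': 100,
    'train_epoch': 1000,
    'env_kwargs': {'horizon': 100},
}
run_task(variant, log_dir='./data/planet-example', exp_name='planet-example')
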
Example #2
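            # Evaluation loop: act in the environment, accumulate reward, and end the
            # episode when done; the average over args.test_episodes is printed below.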
            for t in pbar:
                belief, posterior_state, action, observation, reward, done = update_belief_and_act(
                    args, env, planner, transition_model, encoder,
                    belief, posterior_state, action,
                    observation.to(device=args.device))
                total_reward += reward
                if args.render:
                    env.render()
                if done:
                    pbar.close()
                    break
    print('Average Reward:', total_reward / args.test_episodes)
    env.close()
    quit()

# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1), total=args.episodes,
                    initial=metrics['episodes'][-1] + 1):
    # Model fitting
    losses = []
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = D.sample(args.batch_size,
                                                                args.chunk_size)  # Transitions start at time t = 0
        # Create initial belief and state for time t = 0
        init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device)
        init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)