def test_mpc_and_model(model, args, L):
    """ Tests the model (is MPC helping?)
    """
    L.log(f"\n\n== Testing trained ensemble model with MPC planner  == \n\n")
    utils.seed(args.seed)

    # Environment
    env = Env(
        args.env_name,
        max_episode_len=args.max_episode_len,
        action_repeat=args.action_repeat,
        seed=args.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # MPC Planner
    reward_measure = RewardMeasure(env, args.reward_scale)
    expl_measure = None
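    # (Presumably a CEM-style planner: at each step it samples n_candidates action
    # sequences of length plan_horizon, scores them under the ensemble model with
    # reward_measure, refits to the top_candidates best, and repeats for
    # optimisation_iters iterations. expl_measure is left as None, so no intrinsic
    # exploration bonus is used here.)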
    mpc_agent = MpcAgent(
        model,
        args.ensemble_size,
        action_size,
        plan_horizon=args.plan_horizon,
        optimisation_iters=args.optimisation_iters,
        n_candidates=args.n_candidates,
        top_candidates=args.top_candidates,
        reward_measure=reward_measure,
        expl_measure=expl_measure,
        device=DEVICE,
    )

    # Data sampler (MPC planner acting in the real environment)
    mpc_sampler = ControlSampler(env, mpc_agent)

    # Logging
    mpc_log_fn = lambda step, reward: L.log(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: L.log(f"Train Epoch {epoch}: {loss}")

    # Main loop
    rewards = []
    global_step = 0
    for episode in range(args.n_test_epi):
        L.log(f"\nEpisode {episode}")

        # Evaluate the MPC planner against the real environment
        L.log(f"Testing on {args.n_eval_episodes} episodes")

        stats = mpc_sampler.sample_episodes(
            args.n_eval_episodes,
            action_noise=None,
            log_fn=mpc_log_fn,
            log_every=args.mpc_log_every,
        )
        L.log_episode(stats['rewards'], stats['steps'])
        rewards.append(stats['rewards'][0])
        L.save()

    return rewards

def train_sac_and_model(args, L):
    """ Trains the SAC agents and the ensemble dynamics model.
        Data is collected via the SAC agent.
    """
    L.log(f"\n\n== Training SAC agents and ensemble model == \n\n")
    utils.seed(args.seed)

    # Environment
    env = Env(
        args.env_name,
        max_episode_len=args.max_episode_len,
        action_repeat=args.action_repeat,
        seed=args.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # Buffer
    normalizer = Normalizer()
    buffer = Buffer(
        state_size,
        action_size,
        args.ensemble_size,
        args.batch_size,
        normalizer=normalizer,
        buffer_size=args.buffer_size,
        device=DEVICE,
    )

    # Model
    model = EnsembleDynamicsModel(
        state_size + action_size,
        state_size,
        args.ensemble_size,
        args.hidden_size,
        normalizer=normalizer,
        device=DEVICE,
    )

    trainer = Trainer(
        model,
        buffer,
        n_train_epochs=args.n_train_epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        epsilon=args.epsilon,
        grad_clip_norm=args.grad_clip_norm,
    )

    # SAC agents
    sac_agents = [
        make_sac_agent(
            state_shape=env.observation_space.shape,
            action_shape=env.action_space.shape,
            args=args,
            device=DEVICE,
        ) for _ in range(args.n_sac_agents)
    ]
    sac_agent = EnsembleSacAgent(sac_agents, buffer)

    # Data sampler
    random_sampler = RandomSampler(env)
    sac_sampler = ControlSampler(env, sac_agent)

    # Logging
    sac_log_fn = lambda step, reward: L.log(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: L.log(f"Train Epoch {epoch}: {loss}")

    # Collect random data
    random_sampler.sample_record_episodes(args.n_seed_episodes, buffer)
    L.log(f"Collected {buffer.current_size} seed frames")

    # Main loop
    rewards = []
    global_step = 0
    for episode in range(args.n_train_epi):
        L.log(f"\nEpisode {episode} [{buffer.current_size} frames]")

        # Collect data
        L.log(f"Collecting {args.n_collect_episodes} episodes of data")

        sac_agent.toggle_updates(True)
        sac_agent.toggle_stochastic(True)

        buffer, stats = sac_sampler.sample_record_episodes(
            args.n_collect_episodes,
            buffer,
            action_noise=None,
            log_fn=sac_log_fn,
            log_every=args.mpc_log_every,
        )
        L.log_episode(stats['rewards'][0], stats['steps'][0])
        L.save()

    # Train Model
    n_batches = buffer.current_size // args.batch_size
    L.log(
        f"\nTraining on ({n_batches * args.batch_size}) frames ({n_batches}) batches | buffer size ({buffer.current_size})\n"
    )
    trainer.train(n_batches=n_batches,
                  log_fn=train_log_fn,
                  log_every=args.train_log_every)
    sac_agent.save(L.path)
    trainer.save_models(L.path)

    return sac_agent, model
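
# A small driver sketch (not part of the original file) showing how the two routines
# above could be chained: first train the SAC agents and the ensemble model, then
# evaluate the trained model under the MPC planner. The `args` namespace and the
# logger `L` are assumed to expose the same fields and methods referenced above;
# `run_train_then_test` is a hypothetical name.
def run_train_then_test(args, L):
    sac_agent, model = train_sac_and_model(args, L)
    rewards = test_mpc_and_model(model, args, L)
    L.log(f"Mean MPC test reward: {sum(rewards) / len(rewards)}")
    return sac_agent, model, rewards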
def main(config):
    utils.seed(config.seed)

    env = Env(
        config.env_name,
        max_episode_len=config.max_episode_len,
        action_repeat=config.action_repeat,
        seed=config.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    buffer = Buffer(
        state_size,
        action_size,
        None,
        config.batch_size,
        buffer_size=config.buffer_size,
        n_augments=config.n_augments,
        augment_std=config.augment_std,
        reward_std=config.reward_std,
        sample_jitter=config.sample_jitter,
        device=DEVICE,
    )

    sac_agents = [
        make_sac_agent(
            state_shape=env.observation_space.shape,
            action_shape=env.action_space.shape,
            args=config,
            device=DEVICE,
        ) for _ in range(config.n_sac_agents)
    ]
    sac_agent = EnsembleSacAgent(sac_agents, buffer)

    random_sampler = RandomSampler(env)
    sac_sampler = ControlSampler(env, sac_agent)

    sac_log_fn = lambda step, reward: print(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: print(f"Train Epoch {epoch}: {loss}")

    random_sampler.sample_record_episodes(config.n_seed_episodes, buffer)
    print(f"Collected {buffer.current_size} seed frames")

    rewards = []
    global_step = 0
    for episode in range(config.n_episodes):
        print(f"\nEpisode {episode} [{buffer.current_size} frames]")

        print(f"Collecting {config.n_collect_episodes} episodes of data")
        sac_agent.toggle_updates(True)
        sac_agent.toggle_stochastic(True)
        buffer, stats = sac_sampler.sample_record_episodes(
            config.n_collect_episodes,
            buffer,
            log_fn=sac_log_fn,
            log_every=config.sac_log_every,
        )
        print(f"Train reward: {stats['rewards']} Steps: {stats['steps']}")

        print(f"Testing on {config.n_test_episodes} episodes")
        sac_agent.toggle_updates(False)
        sac_agent.toggle_stochastic(False)
        stats = sac_sampler.sample_episodes(config.n_eval_episodes,
                                            log_fn=sac_log_fn,
                                            log_every=config.sac_log_every)
        print(f"Test reward: {stats['rewards']} steps: {stats['steps']}")
        rewards.append(stats["rewards"][0])
        # Periodically save rewards and rsync the log directory to persistent storage
        if episode % 100 == 0:
            subprocess.call(['echo', "rewards: " + str(stats["rewards"][0])])
            current_time = datetime.now().strftime("%H:%M:%S")
            subprocess.call(
                ['echo', "saving rewards at time: " + current_time])
            np.save(config.logdir + "/rewards.npy",
                    np.array(deepcopy(rewards)))
            subprocess.call([
                'rsync', '--archive', '--update', '--compress', '--progress',
                str(config.logdir) + "/",
                str(config.savedir)
            ])
            print(f"Rsynced files from {config.logdir}/ to {config.savedir}")

    return rewards
def main(config):
    utils.seed(config.seed)

    env = Env(
        config.env_name,
        max_episode_len=config.max_episode_len,
        action_repeat=config.action_repeat,
        seed=config.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    normalizer = Normalizer()
    buffer = Buffer(
        state_size,
        action_size,
        config.ensemble_size,
        config.batch_size,
        normalizer=normalizer,
        buffer_size=config.buffer_size,
        device=DEVICE,
    )

    model = EnsembleDynamicsModel(
        state_size + action_size,
        state_size,
        config.ensemble_size,
        config.hidden_size,
        normalizer=normalizer,
        device=DEVICE,
    )

    trainer = Trainer(
        model,
        buffer,
        n_train_epochs=config.n_train_epochs,
        batch_size=config.batch_size,
        learning_rate=config.learning_rate,
        epsilon=config.epsilon,
        grad_clip_norm=config.grad_clip_norm,
    )

    reward_measure = RewardMeasure(env, config.reward_scale)
    expl_measure = None
    mpc_agent = MpcAgent(
        model,
        config.ensemble_size,
        action_size,
        plan_horizon=config.plan_horizon,
        optimisation_iters=config.optimisation_iters,
        n_candidates=config.n_candidates,
        top_candidates=config.top_candidates,
        reward_measure=reward_measure,
        expl_measure=expl_measure,
        alpha=config.alpha,
        device=DEVICE,
    )

    sac_agents = [
        make_sac_agent(
            state_shape=env.observation_space.shape,
            action_shape=env.action_space.shape,
            args=config,
            device=DEVICE,
        )
        for _ in range(config.n_sac_agents)
    ]

    hybrid_agent = HybridAgent(
        sac_agents,
        mpc_agent,
        model,
        buffer,
        action_size,
        n_sac_updates=config.n_sac_updates,
        cem_std=config.cem_std,
        device=DEVICE,
    )

    random_sampler = RandomSampler(env)
    hybrid_sampler = ControlSampler(env, hybrid_agent)

    mpc_log_fn = lambda step, reward: print(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: print(f"Train Epoch {epoch}: {loss}")

    random_sampler.sample_record_episodes(config.n_seed_episodes, buffer)
    print(f"Collected {buffer.current_size} seed frames")

    rewards = []
    global_step = 0
    for episode in range(config.n_episodes):
        print(f"\n=== Episode {episode} [{buffer.current_size} frames] ===")

        n_batches = buffer.current_size // config.batch_size
        print(
            f"\nTraining on ({n_batches * config.batch_size}) frames ({n_batches}) batches (buffer size {buffer.current_size})"
        )
        if config.warm_start == 0:
            trainer.reset_models()
        trainer.train(
            n_batches=n_batches, log_fn=train_log_fn, log_every=config.train_log_every
        )

        warm_up = (episode < config.n_warm_up_episodes)
        print(f"\nCollecting {config.n_collect_episodes} episodes of data [warm up: {warm_up}]")
        
        hybrid_agent.toggle_updates(True)
        # TODO double stochastic?
        hybrid_agent.toggle_stochastic(False)
        hybrid_agent.toggle_warm_up(warm_up)
        
        buffer, stats = hybrid_sampler.sample_record_episodes(
            config.n_collect_episodes,
            buffer,
            action_noise=config.action_noise,
            log_fn=mpc_log_fn,
            log_every=config.mpc_log_every,
        )
        print(f"Train reward: {stats['rewards']} Steps: {stats['steps']}")

        if episode % config.test_every == 0:
            print(f"\nTesting on {config.n_test_episodes} episodes")
            hybrid_agent.toggle_updates(False)
            hybrid_agent.toggle_stochastic(False)
            
            stats = hybrid_sampler.sample_episodes(
                config.n_eval_episodes,
                action_noise=None,
                log_fn=mpc_log_fn,
                log_every=config.mpc_log_every
            )
            print(f"Test reward: {stats['rewards']} steps: {stats['steps']}")
            rewards.append(stats["rewards"][0])

    return rewards

def main(config):
    utils.seed(config.seed)

    env = Env(
        config.env_name,
        max_episode_len=config.max_episode_len,
        action_repeat=config.action_repeat,
        seed=config.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    normalizer = Normalizer()
    buffer = Buffer(
        state_size,
        action_size,
        config.ensemble_size,
        config.batch_size,
        normalizer=normalizer,
        buffer_size=config.buffer_size,
        device=DEVICE,
    )

    model = EnsembleDynamicsModel(
        state_size + action_size,
        state_size,
        config.ensemble_size,
        config.hidden_size,
        normalizer=normalizer,
        device=DEVICE,
    )

    trainer = Trainer(
        model,
        buffer,
        n_train_epochs=config.n_train_epochs,
        batch_size=config.batch_size,
        learning_rate=config.learning_rate,
        epsilon=config.epsilon,
        grad_clip_norm=config.grad_clip_norm,
    )

    reward_measure = RewardMeasure(env, config.reward_scale)
    expl_measure = None
    mpc_agent = MpcAgent(
        model,
        config.ensemble_size,
        action_size,
        plan_horizon=config.plan_horizon,
        optimisation_iters=config.optimisation_iters,
        n_candidates=config.n_candidates,
        top_candidates=config.top_candidates,
        reward_measure=reward_measure,
        expl_measure=expl_measure,
        device=DEVICE,
    )

    random_sampler = RandomSampler(env)
    mpc_sampler = ControlSampler(env, mpc_agent)

    mpc_log_fn = lambda step, reward: print(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: print(f"Train Epoch {epoch}: {loss}")

    random_sampler.sample_record_episodes(config.n_seed_episodes, buffer)
    print(f"Collected {buffer.current_size} seed frames")

    rewards = []
    global_step = 0
    for episode in range(config.n_episodes):
        print(f"\nEpisode {episode} [{buffer.current_size} frames]")

        n_batches = buffer.current_size // config.batch_size
        print(
            f"Training on ({n_batches * config.batch_size}) frames ({n_batches}) batches (buffer size {buffer.current_size})"
        )
        if config.warm_start == 0:
            trainer.reset_models()
        trainer.train(
            n_batches=n_batches, log_fn=train_log_fn, log_every=config.train_log_every
        )

        print(f"Collecting {config.n_collect_episodes} episodes of data")

        buffer, stats = mpc_sampler.sample_record_episodes(
            config.n_collect_episodes,
            buffer,
            action_noise=config.action_noise,
            log_fn=mpc_log_fn,
            log_every=config.mpc_log_every,
        )
        print(f"Train reward: {stats['rewards']} Steps: {stats['steps']}")

        print(f"Testing on {config.n_test_episodes} episodes")
        stats = mpc_sampler.sample_episodes(
            config.n_eval_episodes,
            action_noise=None,
            log_fn=mpc_log_fn,
            log_every=config.mpc_log_every
        )
        print(f"Test reward: {stats['rewards']} steps: {stats['steps']}")
        rewards.append(stats["rewards"][0])

        if episode % 10 == 0:
            trainer.save_models(episode)

    return rewards
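
# An entry-point sketch (not part of the original file). `get_config()` is a
# hypothetical stand-in for the project's own argparse/config loader and must
# supply every `config.*` field referenced in `main`; `config.logdir` is likewise
# assumed, mirroring how rewards are persisted in the earlier example.
if __name__ == "__main__":
    config = get_config()  # hypothetical config loader
    episode_rewards = main(config)
    np.save(config.logdir + "/rewards.npy", np.array(episode_rewards))
    print(f"Saved {len(episode_rewards)} test rewards to {config.logdir}/rewards.npy")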