# Imports assumed for the snippet below (later snippets need additional
# names); the fast_rl module paths follow the library's layout but may
# vary between versions.
import os

import torch

from fast_rl.agents.dqn import create_dqn_model, dqn_learner
from fast_rl.agents.dqn_models import FixedTargetDQNModule
from fast_rl.core.agent_core import ExperienceReplay, GreedyEpsilon
from fast_rl.core.data_block import MDPDataBunch


def test_to_pickle():
    data = MDPDataBunch.from_env('CartPole-v0',
                                 render='rgb_array',
                                 bs=5,
                                 max_steps=20,
                                 add_valid=False)
    model = create_dqn_model(data,
                             FixedTargetDQNModule,
                             opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method)
    learner.fit(2)

    assert len(data.x.info) == 2
    assert 0 in data.x.info
    assert 1 in data.x.info

    data.to_pickle('./data/test_to_pickle')
    assert os.path.exists('./data/test_to_pickle_CartPole-v0')
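
Every snippet here anneals exploration with GreedyEpsilon(epsilon_start, epsilon_end, decay). As a rough mental model (a sketch only; the exact schedule fast_rl uses is an assumption here), those parameters imply an exponential decay of epsilon per step:

import math

def epsilon_at(step, epsilon_start=1.0, epsilon_end=0.1, decay=0.001):
    # Anneal epsilon exponentially from epsilon_start toward epsilon_end.
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-decay * step)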
Example #2
def test_resolution_wrapper():
    data = MDPDataBunch.from_env('CartPole-v0',
                                 render='rgb_array',
                                 bs=5,
                                 max_steps=10,
                                 add_valid=False,
                                 memory_management_strategy='k_top',
                                 k=1,
                                 feed_type=FEED_TYPE_IMAGE,
                                 res_wrap=partial(ResolutionWrapper,
                                                  w_step=2,
                                                  h_step=2))
    model = create_dqn_model(data,
                             DQNModule,
                             opt=torch.optim.RMSprop,
                             lr=0.1,
                             channels=[32, 32, 32],
                             ks=[5, 5, 5],
                             stride=[2, 2, 2])
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method,
                          callback_fns=[RewardMetric, EpsilonMetric])
    learner.fit(2)
    temp = gym.make('CartPole-v0')
    temp.reset()
    original_shape = temp.render(mode='rgb_array').shape
    assert data.env.render(mode='rgb_array').shape == (original_shape[0] // 2,
                                                       original_shape[1] // 2,
                                                       3)
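
The assertion above expects the wrapper to halve the rendered frame along both axes via w_step=2 and h_step=2. A minimal sketch of a stride-based wrapper with that behavior (StrideResolutionWrapper is a hypothetical stand-in, not fast_rl's actual ResolutionWrapper):

import gym

class StrideResolutionWrapper(gym.Wrapper):
    def __init__(self, env, w_step=2, h_step=2):
        super().__init__(env)
        self.w_step, self.h_step = w_step, h_step

    def render(self, mode='rgb_array', **kwargs):
        frame = self.env.render(mode=mode, **kwargs)
        # Keep every h_step-th row and every w_step-th column of the frame.
        return frame[::self.h_step, ::self.w_step] if mode == 'rgb_array' else frame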
Example #3
def test_dataset_memory_manager(memory_strategy, k):
    data = MDPDataBunch.from_env('CartPole-v0',
                                 render='rgb_array',
                                 bs=5,
                                 max_steps=20,
                                 add_valid=False,
                                 memory_management_strategy=memory_strategy,
                                 k=k)
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop, lr=0.1)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method,
                          callback_fns=[RewardMetric, EpsilonMetric])
    learner.fit(10)

    data_info = {
        episode: data.train_ds.x.info[episode]
        for episode in data.train_ds.x.info if episode != -1
    }
    full_episodes = [
        episode for episode in data_info if not data_info[episode][1]
    ]

    assert sum(not _[1] for _ in data_info.values()) == k, \
        'There should be exactly k full episodes, but there are not.'
    if 'top' in memory_strategy and 'both' not in memory_strategy:
        assert np.argmax([_[0] for _ in data_info.values()]) in full_episodes
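
This test leans on the dataset memory manager keeping exactly k full episodes and, for 'top' strategies, never discarding the highest-reward one. A sketch of that selection rule under those assumptions (names here are illustrative, not fast_rl's API):

def pick_top_k_episodes(episode_rewards, k):
    # episode_rewards: {episode_id: total_reward}; keep the k best episodes.
    ranked = sorted(episode_rewards, key=episode_rewards.get, reverse=True)
    return set(ranked[:k])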
Example #4
    def __init__(self,
                 data: MDPDataBunch,
                 memory=ExperienceReplay(10000),
                 tau=0.001,
                 batch=64,
                 discount=0.99,
                 lr=0.005):
        """
        Implementation of a continuous control algorithm using an actor/critic architecture.

        Notes:
            Uses 4 networks: 2 actors and 2 critics.
            All models use batch norm for feature invariance.
            The critic predicts Q values, while the actor proposes the action to take given a state s.

        References:
            [1] Lillicrap, Timothy P., et al. "Continuous control with deep reinforcement learning."
            arXiv preprint arXiv:1509.02971 (2015).

        Args:
            data: Primary data object to use.
            memory: Replay buffer used for offline training.
            tau: Controls how "soft" or "hard" the primary networks are copied into the target networks.
            batch: Number of transitions sampled per memory query.
            discount: Discount factor (gamma) applied to future Q values.
            lr: Learning rate used by the optimizers.
        """
        super().__init__(data)
        self.name = 'DDPG'
        self.lr = lr
        self.discount = discount
        self.batch = batch
        self.tau = tau
        self.memory = memory

        self.action_model = self.initialize_action_model([64, 64], data)
        self.critic_model = self.initialize_critic_model([64, 64], data)

        self.opt = OptimWrapper.create(Adam,
                                       lr=lr,
                                       layer_groups=self.action_model)
        # The critic optimizer updates the critic's layers, separate from the actor's.
        self.critic_optimizer = OptimWrapper.create(
            Adam, lr=lr, layer_groups=self.critic_model)

        self.t_action_model = self.initialize_action_model([64, 64], data)
        self.t_critic_model = self.initialize_critic_model([64, 64], data)

        self.target_copy_over()

        self.callbacks = [BaseDDPGCallback]

        self.loss_func = MSELoss()
        # TODO Move to Ornstein-Uhlenbeck process
        self.exploration_strategy = GreedyEpsilon(decay=0.001,
                                                  epsilon_end=0.1,
                                                  epsilon_start=1,
                                                  do_exploration=True)
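
target_copy_over blends the trained weights into the target networks. The usual soft-update rule for the tau described in the docstring looks like the sketch below (Polyak averaging per Lillicrap et al. [1]; fast_rl's exact implementation may differ):

def soft_update(target_net, source_net, tau=0.001):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)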
def test_metrics_reward_init():
    data = MDPDataBunch.from_env('CartPole-v0',
                                 render='rgb_array',
                                 bs=5,
                                 max_steps=20)
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method,
                          callback_fns=[RewardMetric])
    learner.fit(2)
def test_from_pickle():
    data = MDPDataBunch.from_pickle('./data/test_to_pickle_CartPole-v0')
    model = create_dqn_model(data,
                             FixedTargetDQNModule,
                             opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method)
    learner.fit(2)

    assert len(data.x.info) == 4
    assert 0 in data.x.info
    assert 3 in data.x.info
def trained_learner(model_cls,
                    env,
                    s_format,
                    experience,
                    bs,
                    layers,
                    memory_size=1000000,
                    decay=0.001,
                    copy_over_frequency=300,
                    lr=None,
                    epochs=450,
                    **kwargs):
    if lr is None: lr = [0.001, 0.00025]
    memory = experience(memory_size=memory_size, reduce_ram=True)
    explore = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=decay)
    if isinstance(lr, list): lr = lr[0] if model_cls == DQNModule else lr[1]
    data = MDPDataBunch.from_env(env,
                                 render='human',
                                 bs=bs,
                                 add_valid=False,
                                 keep_env_open=False,
                                 feed_type=s_format,
                                 memory_management_strategy='k_partitions_top',
                                 k=3,
                                 **kwargs)
    if model_cls == DQNModule:
        model = create_dqn_model(data=data,
                                 base_arch=model_cls,
                                 lr=lr,
                                 layers=layers,
                                 opt=optim.RMSprop)
    else:
        model = create_dqn_model(data=data,
                                 base_arch=model_cls,
                                 lr=lr,
                                 layers=layers)
    learn = dqn_learner(data,
                        model,
                        memory=memory,
                        exploration_method=explore,
                        copy_over_frequency=copy_over_frequency,
                        callback_fns=[RewardMetric, EpsilonMetric])
    learn.fit(epochs)
    return learn
Example #8
    def __init__(self,
                 data: MDPDataBunch,
                 memory=None,
                 batch_size=32,
                 lr=0.001,
                 grad_clip=5,
                 max_episodes=None):
        """Trains an Agent using the Q Learning method on a neural net.

        Notes:
            This is not a true implementation of [1]. A true implementation uses a fixed target network.

        References:
            [1] Mnih, Volodymyr, et al. "Playing atari with deep reinforcement learning."
            arXiv preprint arXiv:1312.5602 (2013).

        Args:
            data: Used for input/output size information.
        """
        super().__init__(data)
        # TODO: recommend a CNN based on state size?
        self.name = 'DQN'
        self.batch_size = batch_size
        self.discount = 0.99
        self.lr = lr
        self.gradient_clipping_norm = grad_clip
        self.loss_func = F.smooth_l1_loss
        self.memory = ifnone(memory, ExperienceReplay(100000))
        self.action_model = self.initialize_action_model([64, 64], data)
        self.opt = OptimWrapper.create(optim.Adam,
                                       lr=self.lr,
                                       layer_groups=self.action_model)
        self.learner_callbacks += [
            partial(BaseDQNCallback, max_episodes=max_episodes)
        ] + self.memory.callbacks
        self.exploration_strategy = GreedyEpsilon(epsilon_start=1,
                                                  epsilon_end=0.1,
                                                  decay=0.001,
                                                  do_exploration=self.training)
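
The fields above pin down the pieces of the update this class performs: an epsilon-greedy policy plus a smooth-L1 TD loss with discount 0.99. A minimal sketch of that loss on a sampled batch (tensor names are illustrative; with no fixed target network, the same q_net bootstraps its own target, matching the note in the docstring):

import torch
import torch.nn.functional as F

def dqn_loss(q_net, s, a, r, s_next, done, discount=0.99):
    # Q(s, a) for the actions actually taken in the batch.
    q = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # TD target: r + discount * max_a' Q(s', a') on non-terminal steps.
        target = r + discount * q_net(s_next).max(1).values * (1 - done.float())
    return F.smooth_l1_loss(q, target)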
def test_dqn_dqn_learner(model_cls, s_format, mem, env):
    data = MDPDataBunch.from_env(env,
                                 render='rgb_array',
                                 bs=32,
                                 add_valid=False,
                                 keep_env_open=False,
                                 feed_type=s_format)
    model = create_dqn_model(data, model_cls)
    memory = mem(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    dqn_learner(data=data,
                model=model,
                memory=memory,
                exploration_method=exploration_method)

    assert config_env_expectations[env]['action_shape'] == (
        1, data.action.n_possible_values.item())
    if s_format == FEED_TYPE_STATE:
        assert config_env_expectations[env]['state_shape'] == data.state.s.shape
def test_export_learner():
    data = MDPDataBunch.from_env('CartPole-v0',
                                 render='rgb_array',
                                 bs=5,
                                 max_steps=20,
                                 add_valid=False)
    model = create_dqn_model(data,
                             FixedTargetDQNModule,
                             opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method)
    learner.fit(2)

    learner.export('test_export.pkl')  # optionally pass pickle_data=True
    learner = load_learner(learner.path, 'test_export.pkl')
    learner.fit(2)
Example #11
def test_databunch_to_pickle():
    data = MDPDataBunch.from_env('CartPole-v0',
                                 render='rgb_array',
                                 bs=5,
                                 max_steps=20,
                                 add_valid=False,
                                 memory_management_strategy='k_partitions_top',
                                 k=3)
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop, lr=0.1)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1,
                                       epsilon_end=0.1,
                                       decay=0.001)
    learner = dqn_learner(data=data,
                          model=model,
                          memory=memory,
                          exploration_method=exploration_method,
                          callback_fns=[RewardMetric, EpsilonMetric])
    learner.fit(10)
    data.to_pickle('./data/cartpole_10_epoch')
    MDPDataBunch.from_pickle(env_name='CartPole-v0',
                             path='./data/cartpole_10_epoch')