Example #1
def gym_continuous_pool_setup():
    """
    :return: (exp_spec, playground)
    :rtype: (ExperimentSpec, GymPlayground)
    """
    exp_spec = ExperimentSpec(batch_size_in_ts=20,
                              max_epoch=2,
                              theta_nn_hidden_layer_topology=(2, 2),
                              environment_name='LunarLanderContinuous-v2')
    exp_spec.set_experiment_spec({'pool_capacity': 100})
    playground = GymPlayground(exp_spec.prefered_environment)

    poolmanager = PoolManager(exp_spec, playground)
    timestepsampleOne = TimestepSample(container_id=1, playground=playground)
    timestepsampleTwo = TimestepSample(container_id=2, playground=playground)
    samplebatch = SampleBatch(batch_size=exp_spec.batch_size_in_ts,
                              playground=playground)
    trajectoriespool = TrajectoriesPool(capacity=exp_spec['pool_capacity'],
                                        batch_size=exp_spec.batch_size_in_ts,
                                        playground=playground)

    env = playground.env
    initial_observation = env.reset()

    yield (exp_spec, playground, poolmanager, timestepsampleOne,
           timestepsampleTwo, samplebatch, trajectoriespool, env,
           initial_observation)
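The setup generator above yields its objects rather than returning them, which matches pytest's fixture protocol; the test in the last example below receives the yielded tuple through an argument of the same name. A minimal consumer sketch, assuming the generator is registered as a pytest fixture (the original project may instead do this in a conftest.py), with the "fresh pool is empty" assertion being an assumption rather than documented behaviour:

import pytest

# Assumption: register the generator above as a fixture; the original project may use
# the @pytest.fixture decorator directly or a conftest.py instead.
gym_continuous_pool_setup = pytest.fixture(gym_continuous_pool_setup)

def test_pool_starts_empty(gym_continuous_pool_setup):
    # pytest injects the yielded 9-tuple under the fixture's name
    exp_spec, playground, poolmanager, *_, env, initial_observation = gym_continuous_pool_setup
    assert poolmanager.timestep_collected_so_far() == 0  # assumed: a fresh pool holds no timesteps
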
Example #2
    def __init__(self, experiment_spec: ExperimentSpec, playground: GymPlayground, discounted: bool = True):
        self._exp_spec = experiment_spec
        self._playground_spec = playground.get_environment_spec()
        self.discounted = discounted

        # (nice to have) todo:refactor --> to a speed-optimized version:
        #       - init list with max env trj length and value 0.
        #       - write collected observations with a running index
        #   Check benchmark and proof of concept:
        #         - exploration_and_benchmarking/data_structure/data_structure_prof_of_concept_benchmark.py
        #         - exploration_and_benchmarking/data_structure/list_vs_array_data_structure_benchmark.py
        self.observations = []
        self.actions = []
        self.rewards = []

        self.q_values = None
        self.theReturn = None
        self.lenght = None

        # Internal state
        # (nice to have) todo:refactor --> using the namedtuple InternalState:
        self._step_count_since_begining_of_training = 0
        self._trj_collected = 0
        self._q_values_computed = False
        self._trj_pending_pop = False
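The refactor sketched in the todo above (preallocate a zero-initialized buffer of the environment's maximum trajectory length and write through a running index instead of appending to Python lists) could look roughly like the following; the class and attribute names are illustrative, not part of the original collector:

import numpy as np

class PreallocatedTrajectoryBuffer:
    """Illustrative sketch of the speed-optimized storage hinted at in the todo above."""

    def __init__(self, max_trj_length: int, obs_dim: int):
        # Fixed-size arrays initialized to zero, as suggested in the todo
        self._observations = np.zeros((max_trj_length, obs_dim))
        self._rewards = np.zeros(max_trj_length)
        self._write_idx = 0  # running index replaces list.append

    def collect(self, obs, rew) -> None:
        self._observations[self._write_idx] = obs
        self._rewards[self._write_idx] = rew
        self._write_idx += 1

    def filled_view(self):
        # Only the written prefix is meaningful when a trajectory ends early
        return self._observations[:self._write_idx], self._rewards[:self._write_idx]
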
Example #3
    def __init__(self, exp_spec: ExperimentSpec, agent_root_dir: str = None):
        """
        Build agent computation graph

        :param exp_spec: Experiment specification covering the NN topology, algorithm training hyperparameters, and some environment details
        :type exp_spec: ExperimentSpec
        :param agent_root_dir: The agent root directory
        :type agent_root_dir: str
        """

        if agent_root_dir is not None:
            self.agent_root_dir = agent_root_dir
        else:
            self._use_hardcoded_agent_root_directory()

        self.exp_spec = exp_spec

        try:
            hec = exp_spec['harderEnvCoeficient']
            self.playground = GymPlayground(
                environment_name=exp_spec.prefered_environment,
                harderEnvCoeficient=hec)
        except KeyError:
            self.playground = GymPlayground(
                environment_name=exp_spec.prefered_environment)
        """ ---- Init computation graph ---- """
        # required placeholder for the Agent.play() method
        self.obs_t_ph = None
        self.policy_pi = None

        self._build_computation_graph()

        not_implemented_msg = "must be set by _build_computation_graph()"
        assert self.obs_t_ph is not None, "self.obs_t_ph {}".format(
            not_implemented_msg)
        assert self.policy_pi is not None, "self.policy_pi {}".format(
            not_implemented_msg)
        """ ---- Setup parameters saving ---- """
        self.saver = tf_cv1.train.Saver()
        self.writer = None
        self.this_run_dir = None
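The assertions above define a contract for subclasses: _build_computation_graph() must assign self.obs_t_ph and self.policy_pi before __init__ finishes. Below is a minimal sketch of a subclass honouring that contract, assuming the enclosing class is importable as Agent, that tf_cv1 is the project's TensorFlow 1.x compat alias (as the tf_cv1.train.Saver() call above suggests), and that playground.env exposes a standard gym environment; the class name and layer sizes are illustrative only:

class MinimalDiscreteAgent(Agent):  # hypothetical subclass, for illustration only
    def _build_computation_graph(self):
        obs_dim = self.playground.env.observation_space.shape[0]
        n_actions = self.playground.env.action_space.n
        # Both attributes below are required by Agent.play(), as checked by the assertions in __init__
        self.obs_t_ph = tf_cv1.placeholder(tf_cv1.float32, shape=(None, obs_dim), name='obs_t_ph')
        hidden = tf_cv1.layers.dense(self.obs_t_ph, 16, activation=tf_cv1.nn.relu)
        logits = tf_cv1.layers.dense(hidden, n_actions)
        self.policy_pi = tf_cv1.argmax(logits, axis=-1, name='policy_pi')
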
Example #4
def gym_discrete_setup():
    """
    :return: (exp_spec, playground, trajectory_collector, uni_batch_collector, env, initial_observation)
    :rtype: (ExperimentSpec, GymPlayground, TrajectoryCollector, UniformBatchCollector, Union[TimeLimit, Any], Any)
    """
    exp_spec = ExperimentSpec(batch_size_in_ts=1000,
                              max_epoch=2,
                              theta_nn_hidden_layer_topology=(2, 2))
    playground = GymPlayground('LunarLander-v2')

    trajectory_collector = TrajectoryCollector(exp_spec, playground)
    uni_batch_collector = UniformBatchCollector(
        capacity=exp_spec.batch_size_in_ts)

    env = playground.env
    initial_observation = env.reset()
    yield exp_spec, playground, trajectory_collector, uni_batch_collector, env, initial_observation
# coding=utf-8

from blocAndTools.buildingbloc import GymPlayground, gym_environment_reward_assesment
from blocAndTools.rl_vocabulary import rl_name

vocab = rl_name()

playgroundLunarLanderContinuous = GymPlayground('LunarLanderContinuous-v2')
playgroundMountainCarContinuous = GymPlayground('MountainCarContinuous-v0')
playgroundBipedalWalkerContinuous = GymPlayground('BipedalWalker-v2')
playgroundBipedalWalkerHardcoreContinuous = GymPlayground(
    'BipedalWalkerHardcore-v2')
playgroundPendulum = GymPlayground('Pendulum-v0')
playgroundCartPole = GymPlayground('CartPole-v1')
""" Average reward over 5000 sample in environment:
        CartPole: 							1.0
        Pendulum: 							-6.194059069237778
        MountainCarContinuous: 				-0.03379479089447222
        LunarLanderContinuous: 				-1.8451261821262512
        BipedalWalkerContinuous: 			-0.09389189287451959
        BipedalWalkerHardcoreContinuous: 	-0.21095661603674906
        
    Average reward over 10000 samples in environment:
        CartPole: 							1.0
        Pendulum: 							-6.392986420889935
        MountainCarContinuous: 				-0.033452527858401936
        LunarLanderContinuous: 				-2.1619796013042705
        BipedalWalkerContinuous: 			-0.09298591752622176
        BipedalWalkerHardcoreContinuous: 	-0.056385564141322744
    
    Average reward over 30000 samples in environment:
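The averages listed above were presumably measured by stepping each environment under random actions; a self-contained sketch of that kind of measurement with plain gym (written independently of the gym_environment_reward_assesment helper imported above, whose exact signature is not shown in this snippet):

import gym

def average_reward_over_samples(env_name: str, n_samples: int = 5000) -> float:
    """Estimate the mean per-step reward under a uniformly random policy."""
    env = gym.make(env_name)
    obs = env.reset()
    total = 0.0
    for _ in range(n_samples):
        obs, rew, done, _info = env.step(env.action_space.sample())
        total += rew
        if done:
            obs = env.reset()
    env.close()
    return total / n_samples

# e.g. average_reward_over_samples('CartPole-v1', 5000) should hover around 1.0,
# since CartPole emits a +1 reward on every timestep
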
def test_PoolManager_PRODUCE_MINIBATCH(gym_continuous_pool_setup):
    # region ::Type hint bloc ...
    exp_spec: ExperimentSpec
    playground: GymPlayground
    poolmanager: PoolManager
    timestepsampleOne: TimestepSample
    timestepsampleTwo: TimestepSample
    samplebatch: SampleBatch
    trajectoriespool: TrajectoriesPool
    env: Union[TimeLimit, Any]
    # endregion

    (_, _, _, _, _, _, _, env, initial_observation) = gym_continuous_pool_setup

    POOL_CAPACITY_4 = 4
    exp_spec = ExperimentSpec(batch_size_in_ts=2,
                              max_epoch=2,
                              theta_nn_hidden_layer_topology=(2, 2),
                              environment_name='LunarLanderContinuous-v2')
    exp_spec.set_experiment_spec({'pool_capacity': POOL_CAPACITY_4})
    playground = GymPlayground(exp_spec.prefered_environment)
    poolmanager = PoolManager(exp_spec, playground)

    timestepsample1 = TimestepSample(container_id=1, playground=playground)
    timestepsample2 = TimestepSample(container_id=2, playground=playground)
    timestepsample3 = TimestepSample(container_id=3, playground=playground)
    timestepsample4 = TimestepSample(container_id=4, playground=playground)
    tss_collection = [
        timestepsample1, timestepsample2, timestepsample3, timestepsample4
    ]

    obs_t = initial_observation
    tss: TimestepSample
    for tss in tss_collection:
        act_t, obs_t_prime, rew_t, done_t = step_foward_and_collect(
            env, obs_t, poolmanager)
        tss.replace(obs_t=obs_t,
                    act_t=act_t,
                    obs_t_prime=obs_t_prime,
                    rew_t=rew_t,
                    done_t=done_t)
        obs_t = obs_t_prime

    assert poolmanager.timestep_collected_so_far() == POOL_CAPACITY_4

    minibatch_list1 = poolmanager._trajectories_pool.sample_from_pool_as_list()
    minibatch_list2 = poolmanager._trajectories_pool.sample_from_pool_as_list()

    print(minibatch_list1)
    print(minibatch_list2)

    assert minibatch_list1 != minibatch_list2

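    # Sanity check with plain random.sample: two independent draws from the same population
    # should differ, mirroring the pool sampling assertion above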
    popu = [idx for idx in range(10)]
    popu_sample1 = random.sample(popu, 3)
    popu_sample2 = random.sample(popu, 3)

    assert popu_sample1 != popu_sample2

    minibatch1: SampleBatch = poolmanager.sample_from_pool()
    minibatch2: SampleBatch = poolmanager.sample_from_pool()

    print(minibatch1)
    print(minibatch2)

    flag1 = [tss in minibatch1 for tss in tss_collection]
    flag2 = [tss in minibatch2 for tss in tss_collection]

    assert sum(flag1) == 2
    assert sum(flag2) == 2

    assert minibatch1 != minibatch2