Example #1
def get_cartpole_agent(agent_name, cartpole_device):
    """
    Build a new agent for the specified cartpole device.

    It would probably make more sense to pass agent_parameters
    as a parameter to this function.

    Parameters
    ----------
    agent_name: str
        an identifier this function recognizes: "a2c" or "ppo"
    cartpole_device:
        an object whose cartpole_env attribute is the environment to train on

    Returns
    -------
        tuple of (tensorforce Agent, dict of agent parameters)
    """
    if agent_name == "a2c":
        agent_parameters = dict(
            agent=agent_name,
            batch_size=11,
            variable_noise=0.1,
            l2_regularization=0.05,  # does this help with catastrophic forgetting?
            horizon=10,  # 10 is good, 1 is bad, 5 is bad, 20 is ok, 15 is bad
            summarizer=dict(
                directory="data/summaries/" + agent_name,
                # list of labels, or 'all'
                labels=[
                    "graph", "entropy", "kl-divergence", "losses", "rewards"
                ],
                frequency=10,  # store values every 10 timesteps
            ),
        )
        agent = Agent.create(
            # agent="a2c",
            environment=cartpole_device.cartpole_env,
            # the cartpole environment will supply argument max_episode_timesteps
            # max_episode_timesteps=max_turns,
            **agent_parameters,
        )
    elif agent_name == "ppo":
        agent_parameters = dict(
            batch_size=10,
            variable_noise=0.1,
        )
        agent = Agent.create(
            agent="ppo",
            environment=cartpole_device.cartpole_env,
            **agent_parameters,
        )
    else:
        raise ValueError(f"agent_name '{agent_name}' is not recognized")

    return agent, agent_parameters
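
The docstring above suggests that agent_parameters would be better passed in by the caller. A minimal sketch of that variant (the function name and call pattern here are hypothetical, not part of the original example):

def make_cartpole_agent(agent_parameters, cartpole_device):
    # Hypothetical variant: the caller supplies the full parameter dict,
    # including the "agent" key ("a2c", "ppo", ...).
    agent = Agent.create(
        environment=cartpole_device.cartpole_env,
        **agent_parameters,
    )
    return agent, agent_parameters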
Example #2
def main():

    bad_seeds_environment = Environment.create(environment=BadSeeds03,
                                               seed_count=10,
                                               bad_seed_count=3,
                                               max_episode_length=100)

    agent = Agent.create(
        agent="a2c",
        batch_size=100,  # this seems to help a2c
        horizon=20,  # does this help a2c?
        exploration=0.01,  # tried without this at first
        l2_regularization=0.1,
        entropy_regularization=0.2,
        variable_noise=0.05,
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_01_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=100000)
    agent.save(directory="saved_models")
Example #3
    def agent(self, agent: any):
        self._agent = Agent.create(agent=agent,
                                   environment=self._tensorforce_environment)

        self._runner = Runner(agent=self._agent,
                              environment=self._tensorforce_environment,
                              save_best_agent=self._save_best_agent)
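
The fragment above reads like the setter half of a Python property. A minimal sketch of the matching getter (the setter above would then carry an @agent.setter decorator); the _agent attribute name is taken from the fragment, everything else is an assumption:

    @property
    def agent(self):
        # Hypothetical getter paired with the setter shown above.
        return self._agent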
Example #4
    def __init__(self, in_dim, n_action, rl, train):
        super().__init__()
        self.make_in_port('observation', in_dim)
        self.make_in_port('reward', 1)
        self.make_in_port('done', 1)
        self.make_out_port('action', 1)
        self.make_in_port('token_in', 1)
        self.make_out_port('token_out', 1)
        self.n_action = n_action  # number of action choices
        self.results['action'] = np.array([np.random.randint(n_action)])
        self.model = None
        self.env_type = "MotorEnv"
        self.token = 0
        self.prev_actions = 0
        self.init = True
        self.in_dim = in_dim
        self.rl = rl
        if rl:
            self.env = Environment.create(
                environment=MotorComponent.MotorEnv,
                max_episode_timesteps=train["episode_count"] * train["max_steps"],
                n_action=n_action,
                obs_dim=in_dim,
                parent=self)
            self.env.reset()
            self.agent = Agent.create(agent=train['rl_agent'],
                                      environment=self.env)
Example #5
    def __init__(self,
                 environment: 'TradingEnvironment',
                 agent_spec: any,
                 save_best_agent: bool = False,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A `Tensorforce` agent or agent specification.
            save_best_agent (optional): Whether the runner automatically saves the best agent.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps',
                                                 False)

        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._agent = Agent.create(agent=agent_spec,
                                   environment=self._environment)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=save_best_agent)
Example #6
    def agent(self, agent_spec: any):
        self._agent = Agent.create(agent=agent_spec,
                                   environment=self._environment)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=self._save_best_agent)
Example #7
def main():

    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,     # changed from 20 to 100 for agent_03
        exploration=0.05,  # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2,  # changed from 0.1 to 0.2 for agent_03
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,  # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600  # save checkpoint every 600 seconds (10 minutes)
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)

    bad_seeds_environment.close()
    agent.close()
Example #8
    def train_implementation(self,
                             train_context: easyagents.core.StepsTrainContext):
        """Tensorforce Dqn Implementation of the train loop.

            The implementation follows https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
        """
        tc = train_context
        train_env = self._create_env()
        network = self._create_network_specification()

        agent_type = 'dqn'
        self.log_api(
            'Agent.create', f'(agent="{agent_type}", ' +
            f'network={network}, ' + f'memory={tc.max_steps_in_buffer}, ' +
            f'start_updating={tc.num_steps_buffer_preload}, ' +
            f'learning_rate={tc.learning_rate}, ' +
            f'batch_size={tc.num_steps_sampled_from_buffer}, ' +
            f'update_frequency={tc.num_steps_per_iteration}, ' +
            f'discount={tc.reward_discount_gamma})')
        self._agent = Agent.create(
            agent=agent_type,
            environment=train_env,
            network=network,
            memory=tc.max_steps_in_buffer,
            start_updating=tc.num_steps_buffer_preload,
            learning_rate=tc.learning_rate,
            batch_size=tc.num_steps_sampled_from_buffer,
            update_frequency=tc.num_steps_per_iteration,
            discount=tc.reward_discount_gamma,
        )
        self._train_with_runner(train_env, tc)
Example #9
def set_up():
    tensorflow_settings()
    env = Environment.create(environment=CartSeed01,
                             seed_count=10,
                             bad_seed_count=3,
                             max_count=20)

    agent = Agent.create(
        agent="a2c",
        batch_size=10000,
        horizon=50,
        discount=0.97,
        l2_regularization=0.1,
        variable_noise=0.5,
        environment=env,
        summarizer=dict(
            directory="training_data/a2c_cartseed/summaries",
            labels="all",
            frequency=10,
        ),
        # saver=dict(
        #     directory='saved_models/agent_04_env_04_1000/checkpoints',
        #     frequency=600  # save checkpoint every 600 seconds (10 minutes)
        # ),
    )
    return env, agent
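
A sketch of how the environment/agent pair returned by set_up() might be driven, following the Runner pattern used in the other examples (the episode count is arbitrary, and Runner is assumed to be imported from tensorforce.execution):

def train():
    env, agent = set_up()
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=10000)  # arbitrary episode budget for this sketch
    env.close()
    agent.close()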
Example #10
    def __init__(
        self, agent, environment, max_episode_timesteps=None, evaluation_environment=None,
        save_best_agent=None
    ):
        self.is_environment_external = isinstance(environment, Environment)
        self.environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps
        )

        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps
            )
            assert self.evaluation_environment.states() == self.environment.states()
            assert self.evaluation_environment.actions() == self.environment.actions()

        self.is_agent_external = isinstance(agent, Agent)
        self.agent = Agent.create(agent=agent, environment=self.environment)
        self.save_best_agent = save_best_agent

        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
Example #11
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
Example #12
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=500,
    )

    agent = Agent.create(
        agent="dqn",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='tanh'),
            dict(type='dense', size=32, activation='tanh')
        ],
        environment=bad_seeds_environment,
        batch_size=256,
        memory=int(10**7),
        exploration=0.15,
        summarizer=dict(
            directory="training_data/agent_02_env_02/summaries",
            labels="all",
            frequency=100  # store values every 100 timesteps
        ))

    return bad_seeds_environment, agent
Example #13
    def prepare(self,
                environment=None,
                timestep_range=None,
                states=None,
                actions=None,
                exclude_bool_action=False,
                exclude_int_action=False,
                exclude_float_action=False,
                exclude_bounded_action=False,
                require_observe=False,
                require_all=False,
                **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            if states is None:
                states = deepcopy(self.__class__.states)

            if actions is None:
                actions = deepcopy(self.__class__.actions)
                if exclude_bool_action or self.__class__.exclude_bool_action:
                    actions.pop('bool_action')
                if exclude_int_action or self.__class__.exclude_int_action:
                    actions.pop('int_action')
                if exclude_float_action or self.__class__.exclude_float_action:
                    actions.pop('float_action')
                if exclude_bounded_action or self.__class__.exclude_bounded_action:
                    actions.pop('bounded_action')

            if timestep_range is None:
                timestep_range = self.__class__.timestep_range

            environment = UnittestEnvironment(states=states,
                                              actions=actions,
                                              timestep_range=timestep_range)

        elif timestep_range is not None:
            raise TensorforceError.unexpected()

        environment = Environment.create(environment=environment)

        for key, value in self.__class__.agent.items():
            if key not in agent:
                agent[key] = value

        if self.__class__.require_all or require_all:
            config = None
        elif self.__class__.require_observe or require_observe:
            config = dict(api_functions=['reset', 'act', 'observe'])
        else:
            config = dict(api_functions=['reset', 'act'])

        agent = Agent.create(agent=agent,
                             environment=environment,
                             config=config)

        return agent, environment
Example #14
    def train_implementation(self, train_context: easyagents.core.PpoTrainContext):
        """Tensorforce Ppo Implementation of the train loop.

            The implementation follows https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
        """
        tc = train_context
        train_env = self._create_env()
        network = self._create_network_specification()

        self.log_api('Agent.create', f'(agent="ppo", environment=..., ' +
                     f'network={network}, ' +
                     f'learning_rate={tc.learning_rate}, ' +
                     f'batch_size={tc.num_episodes_per_iteration}, ' +
                     f'optimization_steps={tc.num_epochs_per_iteration}, ' +
                     f'discount={tc.reward_discount_gamma})')
        self._agent = Agent.create(
            agent='ppo',
            environment=train_env,
            network=network,
            learning_rate=tc.learning_rate,
            batch_size=tc.num_episodes_per_iteration,
            optimization_steps=tc.num_epochs_per_iteration,
            discount=tc.reward_discount_gamma,
        )
        self._train_with_runner(train_env, tc)
Example #15
    def __init__(self,
                 agent,
                 environment,
                 evaluation_environment=None,
                 save_best_agent=False):
        # save_best overwrites saver...
        self.is_environment_external = isinstance(environment, Environment)
        self.environment = Environment.create(environment=environment)

        self.is_eval_environment_external = isinstance(evaluation_environment,
                                                       Environment)
        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment)

        self.save_best_agent = save_best_agent
        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict()
        if self.save_best_agent is True:
            # Disable periodic saving
            assert not self.is_agent_external
            kwargs = dict(saver=dict(seconds=None, steps=None))
        self.agent = Agent.create(agent=agent,
                                  environment=self.environment,
                                  **kwargs)

        # self.global_episodes = self.agent.episodes
        # self.global_timesteps = self.agent.timesteps
        # self.global_updates = self.agent.updates
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
Example #16
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeedsSkinny,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="a2c",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu')
        ],
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        #exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/a2c_dense_skinny/summaries",
            # list of labels, or 'all'
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    return bad_seeds_environment, agent
Example #17
    def __init__(self, environment: 'TradingEnvironment', agent_spec: any,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A `Tensorforce` agent or agent specification.
            save_best_agent (optional): Whether the runner automatically saves the best agent.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps',
                                                 False)
        self._save_best_agent = kwargs.get('save_best_agent', False)

        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._agent = Agent.create(
            agent=agent_spec,
            environment=self._environment,
            summarizer=dict(
                directory='data/summaries',
                labels=['graph', 'losses',
                        'rewards'],  # list of labels, or 'all'
                frequency=100  # store values every 100 timesteps
                # (infrequent update summaries every update; other configurations possible)
            ),
        )

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=self._save_best_agent)
Example #18
def base_test(env):
    batch_size = 24

    agent = Agent.create(
        agent='ppo',
        environment=env[0],
        batch_size=batch_size,
        learning_rate=1e-3,
        network=actor_network,
        discount=1.0,
        entropy_regularization=None,
        critic_network=critic_network,
        critic_optimizer=dict(optimizer='adam',
                              multi_step=10,
                              learning_rate=1e-3),
        max_episode_timesteps=n_step,
        parallel_interactions=n_env
        # saver=dict(directory=os.path.join(os.getcwd(), 'saver_data'), frequency=30)
    )

    agent.initialize()

    # Initialize the runner
    runner = ParallelRunner(agent=agent, environments=env)

    # Start the runner
    runner.run(num_episodes=48)
    runner.close()
Example #19
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a DQN agent
    agent = Agent.create(
        agent='dqn',
        environment=environment,
        # memory=100,
        # # Optimization
        # batch_size=10, update_frequency=2, learning_rate=1e-3,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100  # store values every 100 timesteps
            # (infrequent update summaries every update; other configurations possible)
        ),
        recorder=None)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=10000)
    runner.close()
Example #20
def runEnv():
    environment = Environment.create(
        environment=CustomEnvironment, max_episode_timesteps=500
    )
    agent = Agent.create(agent='a2c', environment=environment, batch_size=10, learning_rate=1e-3)

    # Train for 2000 episodes
    for _ in range(2000):
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 1000 episodes
    sum_rewards = 0.0
    for _ in range(1000):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    # print('Mean episode reward:', sum_rewards / 100)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
Example #21
def run_no_runner(environment, nplayers):
    #with open("rl-regenwormen/agent.json", 'r') as fp:
    #    agent = json.load(fp=fp)

    agents = [
        Agent.create(agent='ppo',
                     batch_size=100,
                     learning_rate=1e-3,
                     exploration=0.2,
                     environment=environment,
                     summarizer=dict(directory='summaries', summaries='all'))
        for i in range(nplayers)
    ]

    print("starting training...")
    i = 10000000
    bar = Bar('Training', max=i)
    rewards = {i: 0 for i in range(nplayers)}
    rewards_total = {i: [] for i in range(nplayers)}
    for episode in range(30000):
        for agent in agents:
            agent.reset()
        states = environment.reset()
        terminal = False
        while not terminal:
            try:
                agent = agents[environment.current_player]
                current_player = environment.current_player
                actions = agent.act(states=states)
                #print(actions)
                states, terminal, reward = environment.execute(actions=actions)
                rewards[environment.current_player] += reward
                rewards_total[environment.current_player] += [reward]
                rewards_total[environment.current_player] = rewards_total[
                    environment.current_player][-300:]
                end_of_roll = environment.current_player != current_player
                agent.observe(terminal=end_of_roll, reward=reward)
                if terminal:
                    for agent2 in agents:
                        if agent2 != agent:
                            actions = agent2.act(states=states)
                            states, terminal, reward = environment.execute(
                                actions=actions)
                            agent2.observe(terminal=True, reward=reward)
            except Exception:
                print(f"ENV {environment.state}")
                print(f"ACT {actions}")
                print(states)
                raise
        names = ["lola", "henry de muis", "pykel", "flo"]
        print({
            names[k]:
            (int(v * 100) / 100, int(np.mean(rewards_total[k]) * 100) / 100)
            for k, v in rewards.items()
        })
        rewards = {i: 0 for i in range(nplayers)}
        bar.next()
    bar.finish()
Example #22
    def test_quickstart(self):
        self.start_tests(name='quickstart')

        # ====================

        # Create an OpenAI-Gym environment
        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')

        # Create a PPO agent
        agent = Agent.create(
            agent='ppo',
            environment=environment,
            # Automatically configured network
            network='auto',
            # Optimization
            batch_size=10,
            update_frequency=2,
            learning_rate=1e-3,
            subsampling_fraction=0.2,
            optimization_steps=5,
            # Reward estimation
            likelihood_ratio_clipping=0.2,
            discount=0.99,
            estimate_terminal=False,
            # Critic
            critic_network='auto',
            critic_optimizer=dict(optimizer='adam',
                                  multi_step=10,
                                  learning_rate=1e-3),
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Start the runner
        runner.run(num_episodes=50, use_tqdm=False)
        runner.close()

        # ====================

        self.finished_test()
Example #23
def create_deepcrawl_agent(net,
                           baseline,
                           states,
                           actions,
                           args,
                           size_agent=14,
                           size_target=4,
                           num_action=8,
                           num_embs=128):

    agent = Agent.create(
        # Agent type
        agent='ppo',
        # Inputs structure
        states=states,
        # Actions structure
        actions=actions,
        network=net,
        # MemoryModel

        # 10 episodes per update
        batch_size=int(args.update_number),
        # Every 10 episodes
        update_frequency=int(args.update_number),
        max_episode_timesteps=int(args.num_timesteps),

        # DistributionModel
        discount=0.9,
        entropy_regularization=0.00,
        likelihood_ratio_clipping=0.2,
        critic_network=baseline,
        critic_optimizer=dict(type='multi_step',
                              optimizer=dict(type='subsampling_step',
                                             fraction=0.33,
                                             optimizer=dict(
                                                 type='adam',
                                                 learning_rate=5e-4)),
                              num_steps=10),

        # PPOAgent
        learning_rate=5e-5,
        subsampling_fraction=0.33,
        optimization_steps=20,
        execution=None,
        # TensorFlow etc
        name='agent',
        device=None,
        parallel_interactions=1,
        seed=None,
        saver=None,
        summarizer=None,
        recorder=None)

    return agent
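
The returned PPO agent can then be plugged into the usual Runner loop; a minimal sketch assuming env, net, baseline, states, actions, and args are already defined elsewhere (all of these, the Runner import, and the episode count are assumptions for illustration):

agent = create_deepcrawl_agent(net, baseline, states, actions, args)
runner = Runner(agent=agent, environment=env)
runner.run(num_episodes=1000)  # arbitrary episode budget for this sketch
runner.close()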
Example #24
    def __init__(self,
                 agent,
                 environments,
                 evaluation_environment=None,
                 save_best_agent=False):
        # save_best overwrites saver...
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(name='parallel-runner',
                                        argument='environments',
                                        value=environments)
        elif len(environments) == 0:
            raise TensorforceError.value(name='parallel-runner',
                                         argument='environments',
                                         value=environments)

        self.is_environment_external = tuple(
            isinstance(environment, Environment)
            for environment in environments)
        self.environments = tuple(
            Environment.create(environment=environment)
            for environment in environments)

        self.is_eval_environment_external = isinstance(evaluation_environment,
                                                       Environment)
        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment)

        self.save_best_agent = save_best_agent
        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict(parallel_interactions=len(environments))
        # warning: save_best_agent
        if not self.is_agent_external and self.save_best_agent:
            # Disable periodic saving
            kwargs = dict(saver=dict(seconds=None, steps=None))
        self.agent = Agent.create(agent=agent,
                                  environment=self.environments[0],
                                  **kwargs)
        if not self.agent.model.is_initialized:
            self.agent.initialize()

        # self.global_episodes = self.agent.episodes
        # self.global_timesteps = self.agent.timesteps
        # self.global_updates = self.agent.updates
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        self.evaluation_rewards = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
Example #25
def runEnv():
    environment = Environment.create(environment=CustomEnvironment,
                                     max_episode_timesteps=500)
    agent = Agent.create(
        agent='a2c',
        environment=environment,
        batch_size=10,
        learning_rate=1e-3,
        exploration=0.01,  # tried without this at first
        variable_noise=0.05,
        # variable_noise=0.01 bad?
        l2_regularization=0.1,
        entropy_regularization=0.2,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100,  # store values every 100 timesteps
        ))

    # Train for CustomEnvironment.trainingEps episodes
    for _ in range(CustomEnvironment.trainingEps):
        print("Episode:  ", _)
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
        print("bad seeds: ", CustomEnvironment.badseedsFinal)

    # Evaluate for CustomEnvironment.testingEps episodes
    sum_rewards = 0.0
    for _ in range(CustomEnvironment.testingEps):
        print("Episode:  ", _ + CustomEnvironment.trainingEps)
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
        print("bad seeds: ", CustomEnvironment.badseedsFinal)
    # print('Mean episode reward:', sum_rewards / 100)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
Example #26
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')
    network = _create_network_specification((100, ))

    # Create a dueling DQN agent
    agent = Agent.create(agent='dueling_dqn',
                         environment=environment,
                         network=network)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10000)
    runner.close()
Example #27
    def test_getting_started(self):
        from tensorforce.agents import Agent
        from tensorforce.environments import Environment

        # Setup environment
        # (Tensorforce or custom implementation, ideally using the Environment interface)
        environment = Environment.create(
            environment='test/data/environment.json')

        # Create and initialize agent
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)
        agent.initialize()

        # Reset agent and environment at the beginning of a new episode
        agent.reset()
        states = environment.reset()
        terminal = False

        # Agent-environment interaction training loop
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        # ====================

        # Agent-environment interaction evaluation loop
        while not terminal:
            actions = agent.act(states=states, evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)

        # ====================

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        from tensorforce.execution import Runner

        # Tensorforce runner utility
        runner = Runner(agent='test/data/agent.json',
                        environment='test/data/environment.json')

        # Run training
        runner.run(num_episodes=50, use_tqdm=False)

        # Close runner
        runner.close()

        self.finished_test()
Example #28
def main():

    bad_seeds_environment = Environment.create(environment=Bollux,
                                               seed_count=10,
                                               bad_seed_count=3,
                                               max_episode_length=100)

    # 20200820-223031
    # 20200820-233243

    # batch_size 1000 does not get smarter or dumber
    # batch_size 100 20200821-095410 gets dumber
    # try batch size 10000 !

    agent = Agent.create(
        agent="a2c",
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        #exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_04_bollux_1000000/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_04_bollux_1000000/checkpoints',
            frequency=6000  # save checkpoint every 6000 seconds (100 minutes)
        ),
    )

    # this is the batch_size = 10000 version
    # I hope it is the last env 04
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000000)
    #for i in range(100):
    #    print("running 10000 episodes")
    #    runner.run(num_episodes=10000)
    #    print("saving the agent")
    #    directory = Path(f"saved_models/agent_04_env_04_1000000/10000_{i}/checkpoints")
    #    if directory.exists():
    #        directory.rmdir()
    #    directory.mkdir(parents=True, exist_ok=True)
    #    agent.save(directory=str(directory), format="numpy")

    bad_seeds_environment.close()
    agent.close()
Example #29
    def test_readme(self):
        self.start_tests(name='readme')

        environment = UnittestEnvironment(states=dict(type='float',
                                                      shape=(10, )),
                                          actions=dict(type='int',
                                                       shape=(),
                                                       num_values=5),
                                          timestep_range=(1, 5))

        def get_current_state():
            return environment.reset()

        def execute_decision(x):
            return environment.execute(actions=x)[2]

        # ==========

        from tensorforce.agents import Agent

        # Instantiate a Tensorforce agent
        agent = Agent.create(agent='tensorforce',
                             states=dict(type='float', shape=(10, )),
                             actions=dict(type='int', num_values=5),
                             max_episode_timesteps=100,
                             memory=10000,
                             update=dict(unit='timesteps', batch_size=64),
                             optimizer=dict(type='adam', learning_rate=3e-4),
                             policy=dict(network='auto'),
                             objective='policy_gradient',
                             reward_estimation=dict(horizon=20))

        # Initialize the agent
        agent.initialize()

        # Retrieve the latest (observable) environment state
        state = get_current_state()  # (float array of shape [10])

        # Query the agent for its action decision
        action = agent.act(states=state)  # (scalar between 0 and 4)

        # Execute the decision and retrieve the current performance score
        reward = execute_decision(action)  # (any scalar float)

        # Pass feedback about performance (and termination) to the agent
        agent.observe(reward=reward, terminal=False)

        # ==========

        agent.close()
        environment.close()
        self.finished_test()
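
The single act/observe round trip above generalizes to a loop. A standalone sketch reusing the same shim functions (the 100-step budget is an assumption matching max_episode_timesteps above):

# Sketch only: repeat the act/observe cycle for a fixed number of decisions.
for _ in range(100):
    state = get_current_state()            # shim above resets and returns the state
    action = agent.act(states=state)       # agent's decision
    reward = execute_decision(action)      # performance feedback
    agent.observe(reward=reward, terminal=False)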
Example #30
def main():
    # Create a Gym environment in FruitAPI and convert it to a TensorForce environment
    fruit_env = GymEnvironment(env_name='CartPole-v1')
    environment = TensorForcePlugin.convert(fruit_env)

    # Create a PPO agent
    agent = Agent.create(
        agent='ppo',
        environment=environment,
        # Automatically configured network
        network='auto',
        # Optimization
        batch_size=10,
        update_frequency=2,
        learning_rate=1e-3,
        subsampling_fraction=0.2,
        optimization_steps=5,
        # Reward estimation
        likelihood_ratio_clipping=0.2,
        discount=0.99,
        estimate_terminal=False,
        # Critic
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam',
                              multi_step=10,
                              learning_rate=1e-3),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0,
        variable_noise=0.0,
        # Regularization
        l2_regularization=0.0,
        entropy_regularization=0.0,
        # TensorFlow etc
        name='agent',
        device=None,
        parallel_interactions=1,
        seed=None,
        execution=None,
        saver=None,
        summarizer=None,
        recorder=None)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=300)
    runner.close()