Example No. 1
    def setup(self, dbars: Any) -> Any:
        trainingEnvironment = Environment.create(
            environment=TradingEnvironment(dbars),
        )
        self.agent = Agent.create(
            agent=PPOAgent,
            environment=trainingEnvironment,  # alternatively: states, actions, (max_episode_timesteps)
            update=dict(
                unit='timesteps',
                batch_size=64
            ),
            network="auto",
            ## exploration=?,
            reward_estimation=dict(
                horizon=20
                # discount=?,
            ),
            learning_rate=3e-4,
            # likelihood_ratio_clipping=?,
            # subsampling_fraction=?,
            # multi_step=?
            summarizer=dict(
                directory='./tensorboard/'
            )
        )
        self.agent.save(directory='model-numpy', format='checkpoint', append='episodes')
        ## Train!
        runner = Runner(self.agent, environment=trainingEnvironment)
        runner.run(
            num_episodes=10000,
            save_best_agent='./best-agent/'
        )
        trainingEnvironment.close()
        ## Prepare agent for trading
        self.internal_state = self.agent.initial_internals()
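
Note: these listings omit their import headers. A minimal header that Example No. 1 appears to assume is sketched below; TradingEnvironment and dbars belong to the surrounding trading project rather than to Tensorforce, and the exact PPOAgent import path is an assumption that varies across Tensorforce versions.

from typing import Any

from tensorforce import Agent, Environment, Runner
# Assumption: the PPOAgent import path depends on the installed Tensorforce version
from tensorforce.agents import PPOAgent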
Example No. 2
    def unittest(self,
                 num_updates=None,
                 num_episodes=None,
                 num_timesteps=None,
                 environment=None,
                 min_timesteps=None,
                 states=None,
                 actions=None,
                 exclude_bool_action=False,
                 exclude_int_action=False,
                 exclude_float_action=False,
                 exclude_bounded_action=False,
                 require_observe=False,
                 require_all=False,
                 **agent):
        """
        Generic unit-test.
        """
        agent, environment = self.prepare(
            environment=environment,
            min_timesteps=min_timesteps,
            states=states,
            actions=actions,
            exclude_bool_action=exclude_bool_action,
            exclude_int_action=exclude_int_action,
            exclude_float_action=exclude_float_action,
            exclude_bounded_action=exclude_bounded_action,
            require_observe=require_observe,
            require_all=require_all,
            **agent)

        self.runner = Runner(agent=agent, environment=environment)

        assert (num_updates is not None) + (num_episodes is not None) + \
            (num_timesteps is not None) <= 1
        if num_updates is None and num_episodes is None and num_timesteps is None:
            num_updates = self.__class__.num_updates
            num_episodes = self.__class__.num_episodes
            num_timesteps = self.__class__.num_timesteps
        if num_updates is None and num_episodes is None and num_timesteps is None:
            num_updates = 2
        assert (num_updates is not None) + (num_episodes is not None) + \
            (num_timesteps is not None) == 1

        evaluation = not any([
            require_all, require_observe, self.__class__.require_all,
            self.__class__.require_observe
        ])
        self.runner.run(num_episodes=num_episodes,
                        num_timesteps=num_timesteps,
                        num_updates=num_updates,
                        use_tqdm=False,
                        evaluation=evaluation)
        self.runner.close()
        agent.close()
        environment.close()

        self.finished_test()
Example No. 3
def get_agent_and_runner(max_timesteps=EPISODE_MAX_LENGTH):
    max_timesteps = EPISODE_MAX_LENGTH if max_timesteps is None else max_timesteps
    # OpenAI-Gym environment specification
    gym_environment = gym.make(LEVEL, render=True)
    gym_environment = TimeLimit(gym_environment.unwrapped,
                                max_episode_steps=max_timesteps)
    # gym_environment = Monitor(gym_environment, RECORD_DICT, force=True)

    environment = Environment.create(
        environment=gym_environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
    )

    agent = Agent.create(
        agent='a2c',
        environment=environment,
        # parallel_interactions=PARALLEL,
        # Automatically configured network
        # network='auto',
        network=[
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
        ],
        # AC optimization parameters
        batch_size=256,
        update_frequency=2,
        learning_rate=0.001,
        # Reward estimation
        discount=0.99,
        predict_terminal_values=False,
        # Regularization
        l2_regularization=1.0,
        entropy_regularization=0.0,
        # Preprocessing
        state_preprocessing='linear_normalization',
        reward_preprocessing=None,
        # Exploration
        exploration=0.3,
        variable_noise=0.2,
        # Default additional config values
        config=None,
        # Save agent every 10 updates and keep the 5 most recent checkpoints
        saver=dict(directory=MODEL_DICT, frequency=10, max_checkpoints=5),
        # Log all available Tensorboard summaries
        summarizer=dict(directory=SUMMARY_DICT, summaries='all'),
        # Do not record agent-environment interaction trace
        recorder=None  # RECORD_DICT
    )

    # Initialize the runner
    runner = Runner(
        agent=agent,
        environment=environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
        # num_parallel=PARALLEL,
        # remote="multiprocessing"
    )

    return agent, runner
Example No. 4
def main():
    # Non-vectorized runner
    runner = Runner(agent='benchmarks/configs/ppo.json',
                    environment=VectorizedEnvironment,
                    max_episode_timesteps=10)
    runner.run(num_episodes=1000)

    # Vectorized runner, automatically if num_parallel > 1 and environment.is_vectorizable()
    # (and remote argument not specified)
    runner = Runner(agent='benchmarks/configs/ppo.json',
                    environment=VectorizedEnvironment,
                    max_episode_timesteps=10,
                    num_parallel=16)
    runner.run(num_episodes=1000)
Example No. 5
    def unittest(self,
                 environment=None,
                 states=None,
                 actions=None,
                 num_episodes=None,
                 **agent):
        """
        Generic unit-test.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            max_episode_timesteps = environment.pop(
                'max_episode_timesteps')  # runner argument

        else:
            max_episode_timesteps = self.__class__.max_episode_timesteps

        agent = self.agent_spec(**agent)

        if num_episodes is None:
            num_updates = 2
        else:
            num_updates = None

        runner = Runner(agent=agent,
                        environment=environment,
                        max_episode_timesteps=max_episode_timesteps)
        runner.run(num_episodes=num_episodes,
                   num_updates=num_updates,
                   use_tqdm=False)
        runner.close()

        self.finished_test()
Example No. 6
def record_ppo_config(directory):
    # Start recording traces after 80 episodes -- by then, the environment is solved
    runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                               recorder=dict(directory=directory, start=80)),
                    environment='benchmarks/configs/cartpole.json')
    runner.run(num_episodes=100)
    runner.close()
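
Once recording has run, the written traces can be inspected with NumPy. A small sketch, assuming the 'ppo-traces' directory used in Example No. 7 and the 'trace-*.npz' naming checked in Example No. 20:

import glob
import numpy as np

# Load one recorded trace and print the shape of each stored array
path = sorted(glob.glob('ppo-traces/trace-*.npz'))[0]
trace = np.load(path)
print({name: trace[name].shape for name in trace.files})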
Example No. 7
def main():
    # Record experience traces
    record_ppo_config(directory='ppo-traces')
    # Alternatively:
    # record_custom_act_function(directory='ppo-traces')
    # write_custom_recording_file(directory='ppo-traces')

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    agent.pretrain(directory='ppo-traces',
                   num_iterations=30,
                   num_traces=1,
                   num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
Example No. 8
def socket():
    """
    Train agent on experience collected in parallel from 2 CartPole environments running on
    another machine.

    Typical use case: same as the multiprocessing mode, but with environments on a remote machine, where socket communication is generally preferable to process communication

    Simulate remote environment, usually run on another machine via:
        python run.py --environment gym --level CartPole-v1 --remote socket-server --port 65432
    """
    agent = 'benchmarks/configs/ppo.json'
    environment = 'benchmarks/configs/cartpole.json'

    def server(port):
        Environment.create(environment=environment,
                           remote='socket-server',
                           port=port)

    server1 = Thread(target=server, kwargs=dict(port=65432))
    server2 = Thread(target=server, kwargs=dict(port=65433))
    server1.start()
    server2.start()

    runner = Runner(agent=agent,
                    num_parallel=2,
                    remote='socket-client',
                    host='127.0.0.1',
                    port=65432)
    runner.run(num_episodes=100)  # optional: batch_agent_calls=True
    runner.close()

    server1.join()
    server2.join()
Example No. 9
    def test_quickstart(self):
        self.start_tests(name='quickstart')

        # ====================

        # Create an OpenAI-Gym environment
        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')

        # Create a PPO agent
        agent = Agent.create(
            agent='ppo',
            environment=environment,
            # Automatically configured network
            network='auto',
            # Optimization
            batch_size=10,
            update_frequency=2,
            learning_rate=1e-3,
            subsampling_fraction=0.2,
            optimization_steps=5,
            # Reward estimation
            likelihood_ratio_clipping=0.2,
            discount=0.99,
            estimate_terminal=False,
            # Critic
            critic_network='auto',
            critic_optimizer=dict(optimizer='adam',
                                  multi_step=10,
                                  learning_rate=1e-3),
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Start the runner
        runner.run(num_episodes=50)
        runner.close()

        # ====================

        self.finished_test()
Example No. 10
    def train_agent(self,
                    train_steps,
                    progress_bar=False,
                    optimizer='adam',
                    **kwargs):
        """
        args:
            train_steps(int): number of steps to train the agent

        """
        if self.agent is None:
            if optimizer == 'adam':
                self.agent = Agent.create(agent=self.agent_config,
                                          environment=self.tf_env)
            elif optimizer == 'rmsprop':
                pass

        runner = Runner(agent=self.agent, environment=self.tf_env, **kwargs)
        runner.run(num_timesteps=train_steps, use_tqdm=progress_bar)
        if self.logger:
            runner.agent.save(directory=self.save_dir,
                              filename=self.model_name)
        runner.close()
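
A hypothetical call of the method above, assuming the surrounding class instance (here called trainer) was constructed with agent_config, tf_env, save_dir and model_name already set:

trainer.train_agent(train_steps=50000, progress_bar=True)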
Example No. 11
    def test_execution(self):
        self.start_tests(name='getting-started-execution')

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=10)
        runner.run(num_episodes=10)
        runner.run(num_episodes=5, evaluation=True)
        runner.close()
        self.finished_test()

        # Create agent and environment
        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=10)
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)

        # Train for 10 episodes
        for _ in range(10):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # Evaluate for 5 episodes
        sum_rewards = 0.0
        for _ in range(5):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               evaluation=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward

        sum_rewards / 5  # mean evaluation reward (value unused in this test)

        # Close agent and environment
        agent.close()
        environment.close()

        self.finished_test()
Example No. 12
def main():
    # OpenAI-Gym environment specification
    environment = dict(environment='gym', level='CartPole-v1')
    # or: environment = Environment.create(
    #         environment='gym', level='CartPole-v1', max_episode_timesteps=500)

    # PPO agent specification
    agent = dict(
        agent='ppo',
        # Automatically configured network
        network='auto',
        # PPO optimization parameters
        batch_size=10,
        update_frequency=2,
        learning_rate=3e-4,
        multi_step=10,
        subsampling_fraction=0.33,
        # Reward estimation
        likelihood_ratio_clipping=0.2,
        discount=0.99,
        predict_terminal_values=False,
        reward_processing=None,
        # Baseline network and optimizer
        baseline=dict(type='auto', size=32, depth=1),
        baseline_optimizer=dict(optimizer='adam',
                                learning_rate=1e-3,
                                multi_step=10),
        # Regularization
        l2_regularization=0.0,
        entropy_regularization=0.0,
        # Preprocessing
        state_preprocessing='linear_normalization',
        # Exploration
        exploration=0.0,
        variable_noise=0.0,
        # Default additional config values
        config=None,
        # Save agent every 10 updates and keep the 5 most recent checkpoints
        saver=dict(directory='model', frequency=10, max_checkpoints=5),
        # Log all available Tensorboard summaries
        summarizer=dict(directory='summaries', summaries='all'),
        # Do not record agent-environment interaction trace
        recorder=None)
    # or: Agent.create(agent='ppo', environment=environment, ...)
    # with additional argument "environment" and, if applicable, "parallel_interactions"

    # Initialize the runner
    runner = Runner(agent=agent,
                    environment=environment,
                    max_episode_timesteps=500)

    # Train for 200 episodes
    runner.run(num_episodes=200)
    runner.close()
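
Because the agent specification above configures a saver directory, a later session can restore the trained agent from those checkpoints. A minimal sketch, assuming the same 'model' directory and CartPole environment:

from tensorforce import Agent, Environment, Runner

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500)
# Restore the most recent checkpoint written by the saver configured above
agent = Agent.load(directory='model', environment=environment)
runner = Runner(agent=agent, environment=environment)
runner.run(num_episodes=10, evaluation=True)
runner.close()
agent.close()
environment.close()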
Example No. 13
def local_vectorized():
    """
    Train agent on experience collected in parallel from one vectorized CartPole environment.

    Typical use case:
        time for vectorized environment < time for sequential execution
    """
    agent = 'benchmarks/configs/ppo.json'
    environment = 'custom_cartpole'
    runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500, num_parallel=4)
    runner.run(num_episodes=100)
    runner.close()
Example No. 14
def record_custom_act_function(directory):
    # Trivial custom act function
    def fn_act(states):
        return int(states[2] < 0.0)

    # Record 20 episodes
    runner = Runner(agent=dict(agent=fn_act,
                               recorder=dict(directory=directory)),
                    environment='benchmarks/configs/cartpole.json')
    # or: agent = Agent.create(agent=fn_act, recorder=dict(directory=directory))
    runner.run(num_episodes=20)
    runner.close()
Example No. 15
    def execute(self,
                agent,
                environment,
                num_episodes=None,
                experience_update=None):
        if num_episodes is None:
            num_updates = 2
        else:
            num_updates = None

        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=num_episodes,
                   num_updates=num_updates,
                   use_tqdm=False)
        runner.close()

        # Test experience-update, independent, deterministic
        if experience_update or (experience_update is None
                                 and self.__class__.experience_update):

            for episode in range(
                    num_updates if num_episodes is None else num_episodes):
                episode_states = list()
                episode_internals = list()
                episode_actions = list()
                episode_terminal = list()
                episode_reward = list()
                states = environment.reset()
                internals = agent.initial_internals()
                terminal = False
                deterministic = True
                while not terminal:
                    episode_states.append(states)
                    episode_internals.append(internals)
                    actions, internals = agent.act(states=states,
                                                   internals=internals,
                                                   independent=True,
                                                   deterministic=deterministic)
                    deterministic = not deterministic
                    episode_actions.append(actions)
                    states, terminal, reward = environment.execute(
                        actions=actions)
                    episode_terminal.append(terminal)
                    episode_reward.append(reward)
                agent.experience(states=episode_states,
                                 internals=episode_internals,
                                 actions=episode_actions,
                                 terminal=episode_terminal,
                                 reward=episode_reward)
                agent.update()

        self.finished_test()
Example No. 16
def local():
    """
    Train agent on experience collected in parallel from 4 local CartPole environments.

    Typical use case:
        time for batched agent.act() ~ time for agent.act() > time for environment.execute()
    """
    agent = 'benchmarks/configs/ppo.json'
    environment = 'benchmarks/configs/cartpole.json'
    runner = Runner(agent=agent, environment=environment, num_parallel=4)
    # Batch act/observe calls to agent (otherwise essentially equivalent to single environment)
    runner.run(num_episodes=100, batch_agent_calls=True)
    runner.close()
Example No. 17
def main():
    # Train agent
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    runner = Runner(agent='benchmarks/configs/ppo.json',
                    environment=environment)
    runner.run(num_episodes=100)

    # Save agent SavedModel
    runner.agent.save(directory='saved-model', format='saved-model')
    runner.close()

    # Model serving, potentially using different programming language etc
    # (For regular model saving and loading within Python, see save_load_agent.py example)

    # Load agent SavedModel
    agent = tf.saved_model.load(export_dir='saved-model')

    # Evaluate for 100 episodes
    sum_rewards = 0.0
    for _ in range(100):
        states = environment.reset()

        # Required in case of internal states:
        # internals = agent.initial_internals()
        # internals = recursive_map(batch, internals)

        terminal = False
        while not terminal:

            states = batch(states)
            # Required in case of nested states:
            # states = recursive_map(batch, states)

            auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
            deterministic = True

            actions = agent.act(states, auxiliaries, deterministic)
            # Required in case of internal states:
            # actions_internals = agent.act(states, internals, auxiliaries, deterministic)
            # actions, internals = actions_internals['actions'], actions_internals['internals']

            actions = unbatch(actions)
            # Required in case of nested actions:
            # actions = recursive_map(unbatch, actions)

            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    print('Mean evaluation return:', sum_rewards / 100.0)
    environment.close()
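
The loop above relies on batch/unbatch helpers that are not part of the listing. A minimal sketch of what they might look like for the flat, single-environment case assumed here (nested states would use the recursive_map variant mentioned in the comments):

import numpy as np

def batch(x):
    # Add a leading batch dimension of size 1
    return np.expand_dims(x, axis=0)

def unbatch(x):
    # Convert a TensorFlow tensor to NumPy and strip the batch dimension again
    if hasattr(x, 'numpy'):
        x = x.numpy()
    return x.item() if x.shape == (1,) else np.squeeze(x, axis=0)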
Example No. 18
def multiprocessing():
    """
    Train agent on experience collected in parallel from 4 CartPole environments running in
    separate processes.

    Typical use case:
        (a) time for batched agent.act() ~ time for agent.act()
                        > time for environment.execute() + remote communication
            --> batch_agent_calls = True
        (b) time for environment.execute() > time for agent.act() + process communication
            --> batch_agent_calls = False
    """
    agent = 'benchmarks/configs/ppo.json'
    environment = 'benchmarks/configs/cartpole.json'
    runner = Runner(agent=agent, environment=environment, num_parallel=4, remote='multiprocessing')
    runner.run(num_episodes=100, batch_agent_calls=True)  # case (a); omit batch_agent_calls for case (b)
    runner.close()
Example No. 19
def write_custom_recording_file(directory):
    # Train for 80 episodes -- by then, the environment is solved -- before recording traces
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=80)
    runner.close()

    # Record 20 episodes
    for episode in range(20):

        # Record episode experience
        episode_states = list()
        episode_actions = list()
        episode_terminal = list()
        episode_reward = list()

        # Evaluation episode
        states = environment.reset()
        terminal = False
        while not terminal:
            episode_states.append(states)
            actions = agent.act(states=states,
                                independent=True,
                                deterministic=True)
            episode_actions.append(actions)
            states, terminal, reward = environment.execute(actions=actions)
            episode_terminal.append(terminal)
            episode_reward.append(reward)

        # Write recorded episode trace to npz file
        np.savez_compressed(file=os.path.join(
            directory, 'trace-{:09d}.npz'.format(episode)),
                            states=np.stack(episode_states, axis=0),
                            actions=np.stack(episode_actions, axis=0),
                            terminal=np.stack(episode_terminal, axis=0),
                            reward=np.stack(episode_reward, axis=0))
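
The files written above follow the same trace format that agent.pretrain consumes in Example No. 7, so a fresh agent could be pretrained from them in the same way:

new_agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
new_agent.pretrain(directory=directory,
                   num_iterations=20,
                   num_traces=1,
                   num_updates=1)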
Example No. 20
    def test_pretrain(self):
        # FEATURES.MD
        self.start_tests(name='pretrain')

        def fn_act(states):
            return int(states[2] >= 0.0)

        with TemporaryDirectory() as directory:
            runner = Runner(agent=dict(agent=fn_act,
                                       recorder=dict(directory=directory)),
                            environment='benchmarks/configs/cartpole.json')
            # or: agent = Agent.create(agent=fn_act, recorder=dict(directory='traces'))
            runner.run(num_episodes=10)
            runner.close()

            files = os.listdir(path=directory)
            self.assertEqual(len(files), 10)
            self.assertTrue(
                all(
                    file.startswith('trace-') and file.endswith('.npz')
                    for file in files))

        self.finished_test()
Example No. 21
def main():
    parser = argparse.ArgumentParser(description='Tensorforce runner')
    # Agent arguments
    parser.add_argument(
        '-a',
        '--agent',
        type=str,
        default=None,
        help='Agent (name, configuration JSON file, or library module)')
    parser.add_argument('-c',
                        '--checkpoints',
                        type=str,
                        default=None,
                        help='TensorFlow checkpoints directory')
    parser.add_argument('-s',
                        '--summaries',
                        type=str,
                        default=None,
                        help='TensorBoard summaries directory')
    parser.add_argument('--recordings',
                        type=str,
                        default=None,
                        help='Traces recordings directory')
    # Environment arguments
    parser.add_argument(
        '-e',
        '--environment',
        type=str,
        default=None,
        help='Environment (name, configuration JSON file, or library module)')
    parser.add_argument(
        '-l',
        '--level',
        type=str,
        default=None,
        help='Level or game id, like `CartPole-v1`, if supported')
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help='Maximum number of timesteps per episode')
    parser.add_argument(
        '--visualize',
        action='store_true',
        help='Visualize agent--environment interaction, if supported')
    parser.add_argument(
        '--visualize-directory',
        type=str,
        default=None,
        help=
        'Directory to store videos of agent--environment interaction, if supported'
    )
    parser.add_argument(
        '--import-modules',
        type=str,
        default=None,
        help='Import comma-separated modules required for environment')
    # Parallel execution arguments
    parser.add_argument(
        '--num-parallel',
        type=int,
        default=None,
        help='Number of environment instances to execute in parallel')
    parser.add_argument(
        '--batch-agent-calls',
        action='store_true',
        help='Batch agent calls for parallel environment execution')
    parser.add_argument(
        '--sync-timesteps',
        action='store_true',
        help='Synchronize parallel environment execution on timestep-level')
    parser.add_argument(
        '--sync-episodes',
        action='store_true',
        help='Synchronize parallel environment execution on episode-level')
    parser.add_argument(
        '--remote',
        type=str,
        choices=('multiprocessing', 'socket-client', 'socket-server'),
        default=None,
        help=
        'Communication mode for remote execution of parallelized environments')
    parser.add_argument('--blocking',
                        action='store_true',
                        help='Remote environments should be blocking')
    parser.add_argument(
        '--host',
        type=str,
        default=None,
        help=
        'Socket server hostname(s) or IP address(es), single value or comma-separated list'
    )
    parser.add_argument(
        '--port',
        type=str,
        default=None,
        help=
        'Socket server port(s), single value or comma-separated list, increasing sequence if '
        'single host and port given')
    # Runner arguments
    parser.add_argument(
        '-v',
        '--evaluation',
        action='store_true',
        help='Run environment (last if multiple) in evaluation mode')
    parser.add_argument('-n',
                        '--episodes',
                        type=int,
                        default=None,
                        help='Number of episodes')
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help='Number of timesteps')
    parser.add_argument('-u',
                        '--updates',
                        type=int,
                        default=None,
                        help='Number of agent updates')
    parser.add_argument(
        '--mean-horizon',
        type=int,
        default=1,
        help=
        'Number of episodes progress bar values and evaluation score are averaged over'
    )
    parser.add_argument(
        '--save-best-agent',
        type=str,
        default=None,
        help=
        'Directory to save the best version of the agent according to the evaluation score'
    )
    # Logging arguments
    parser.add_argument('-r',
                        '--repeat',
                        type=int,
                        default=1,
                        help='Number of repetitions')
    parser.add_argument(
        '--path',
        type=str,
        default=None,
        help='Logging path, directory plus filename without extension')
    parser.add_argument('--seaborn', action='store_true', help='Use seaborn')
    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    if args.path is None:
        callback = None

    else:
        assert os.path.splitext(args.path)[1] == ''
        assert args.episodes is not None and args.visualize is not None
        rewards = [list() for _ in range(args.episodes)]
        timesteps = [list() for _ in range(args.episodes)]
        seconds = [list() for _ in range(args.episodes)]
        agent_seconds = [list() for _ in range(args.episodes)]

        def callback(r, p):
            rewards[r.episodes - 1].append(float(r.episode_rewards[-1]))
            timesteps[r.episodes - 1].append(int(r.episode_timesteps[-1]))
            seconds[r.episodes - 1].append(float(r.episode_seconds[-1]))
            agent_seconds[r.episodes - 1].append(
                float(r.episode_agent_seconds[-1]))
            return True

    if args.environment is None:
        environment = None
    else:
        environment = dict(environment=args.environment)
    if args.level is not None:
        environment['level'] = args.level
    if args.visualize:
        environment['visualize'] = True
    if args.visualize_directory is not None:
        environment['visualize_directory'] = args.visualize_directory

    if args.host is not None and ',' in args.host:
        args.host = args.host.split(',')
    if args.port is not None and ',' in args.port:
        args.port = [int(x) for x in args.port.split(',')]
    elif args.port is not None:
        args.port = int(args.port)

    if args.remote == 'socket-server':
        Environment.create(environment=environment,
                           max_episode_timesteps=args.max_episode_timesteps,
                           remote=args.remote,
                           port=args.port)
        return

    if args.agent is None:
        assert args.checkpoints is None and args.summaries is None and args.recordings is None
        agent = None
    else:
        agent = dict(agent=args.agent)
        if args.checkpoints is not None:
            assert 'saver' not in agent
            agent['saver'] = args.checkpoints
        if args.summaries is not None:
            assert 'summarizer' not in agent
            agent['summarizer'] = args.summaries
        if args.recordings is not None:
            assert 'recorder' not in agent
            agent['recorder'] = args.recordings

    for _ in range(args.repeat):
        runner = Runner(agent=agent,
                        environment=environment,
                        max_episode_timesteps=args.max_episode_timesteps,
                        evaluation=args.evaluation,
                        num_parallel=args.num_parallel,
                        remote=args.remote,
                        blocking=args.blocking,
                        host=args.host,
                        port=args.port)
        runner.run(num_episodes=args.episodes,
                   num_timesteps=args.timesteps,
                   num_updates=args.updates,
                   batch_agent_calls=args.batch_agent_calls,
                   sync_timesteps=args.sync_timesteps,
                   sync_episodes=args.sync_episodes,
                   callback=callback,
                   mean_horizon=args.mean_horizon,
                   save_best_agent=args.save_best_agent)
        runner.close()

    if args.path is not None:
        directory = os.path.split(args.path)[0]
        if directory != '' and not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)

        with open(args.path + '.json', 'w') as filehandle:
            filehandle.write(
                json.dumps(
                    dict(rewards=rewards,
                         timesteps=timesteps,
                         seconds=seconds,
                         agent_seconds=agent_seconds)))

        if args.seaborn:
            import seaborn as sns
            sns.set()

        xs = np.arange(len(rewards))
        min_rewards = np.amin(rewards, axis=1)
        max_rewards = np.amax(rewards, axis=1)
        median_rewards = np.median(rewards, axis=1)
        plt.plot(xs, median_rewards, color='green', linewidth=2.0)
        plt.fill_between(xs,
                         min_rewards,
                         max_rewards,
                         color='green',
                         alpha=0.4)
        plt.xlabel('episodes')
        plt.ylabel('reward')
        plt.savefig(fname=(args.path + '.png'))
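
For reference, a typical invocation of this runner script (file name assumed to be run.py, as in the docstring of Example No. 8), using only flags defined above:

    python run.py --agent benchmarks/configs/ppo.json --environment gym --level CartPole-v1 --max-episode-timesteps 500 --episodes 100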
Example No. 22
    def test_single(self):
        self.start_tests(name='single')

        agent = self.agent_spec()
        environment = self.environment_spec()
        runner = Runner(agent=agent, environment=environment)

        # default
        runner.run(num_episodes=3, use_tqdm=False)
        self.finished_test()

        # evaluation
        runner.run(num_episodes=1, use_tqdm=False, evaluation=True)
        self.finished_test()

        # episode callback
        callback_episode_frequency = 2
        self.num_callbacks = 0

        def callback(r, p):
            self.num_callbacks += 1
            self.assertEqual(r.episodes,
                             self.num_callbacks * callback_episode_frequency)

        runner.run(num_episodes=5,
                   callback=callback,
                   callback_episode_frequency=callback_episode_frequency,
                   use_tqdm=False)
        self.finished_test()

        # timestep callback
        callback_timestep_frequency = 3
        self.num_callbacks = 0

        def callback(r, p):
            self.num_callbacks += 1
            self.assertEqual(r.episode_timestep[p],
                             self.num_callbacks * callback_timestep_frequency)

        runner.run(num_episodes=1,
                   callback=callback,
                   callback_timestep_frequency=callback_timestep_frequency,
                   use_tqdm=False)
        self.finished_test()

        # multiple callbacks
        self.is_callback1 = False
        self.is_callback2 = False

        def callback1(r, p):
            self.is_callback1 = True

        def callback2(r, p):
            self.is_callback2 = True

        runner.run(num_episodes=1,
                   callback=[callback1, callback2],
                   callback_timestep_frequency=callback_timestep_frequency,
                   use_tqdm=False)
        runner.close()
        self.finished_test(assertion=(self.is_callback1 and self.is_callback2))
Example No. 23
    def test_unbatched(self):
        self.start_tests(name='unbatched')

        agent = self.agent_spec()
        environment = self.environment_spec()

        # default
        runner = Runner(agent=agent, environment=environment, num_parallel=2)
        runner.run(num_episodes=3, use_tqdm=False)
        runner.close()
        self.finished_test()

        # episode callback
        runner = Runner(agent=agent, environments=[environment, environment])
        callback_episode_frequency = 2
        self.num_callbacks = 0

        def callback(r, p):
            self.num_callbacks += 1
            if self.num_callbacks % 2 == 0:
                self.assertEqual(min(r.episode_timestep), 0)
            self.assertEqual(r.episodes,
                             self.num_callbacks * callback_episode_frequency)

        runner.run(num_episodes=5,
                   callback=callback,
                   callback_episode_frequency=callback_episode_frequency,
                   use_tqdm=False,
                   sync_episodes=True)
        self.finished_test()

        # timestep callback
        callback_timestep_frequency = 3

        def callback(r, p):
            self.assertEqual(
                r.episode_timestep[p] % callback_timestep_frequency, 0)

        runner.run(num_episodes=2,
                   callback=callback,
                   callback_timestep_frequency=callback_timestep_frequency,
                   use_tqdm=False)
        runner.close()
        self.finished_test()

        # evaluation synced
        runner = Runner(agent=agent,
                        environment=environment,
                        num_parallel=2,
                        evaluation=True)
        self.num_evaluations = 0

        def evaluation_callback(r):
            self.num_evaluations += 1

        runner.run(num_episodes=1,
                   use_tqdm=False,
                   evaluation_callback=evaluation_callback,
                   sync_episodes=True)
        self.finished_test(assertion=(self.num_evaluations == 1))

        # evaluation non-synced
        runner.run(num_episodes=1,
                   use_tqdm=False,
                   evaluation_callback=evaluation_callback)
        runner.close()
        self.finished_test(assertion=(self.num_evaluations >= 2))
Example No. 24
        # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask")
        states = dict(state=self.state, action_mask=action_mask)

        return states

    def execute(self, actions):
        # Compute terminal and reward
        terminal = False
        if actions == 1:
            reward = -np.abs(self.state / 5.0 - 1.0)
        else:
            reward = (1 - actions) * (self.state / 5.0 - 1.0)

        # Compute next state and associated action mask
        self.state += actions - 1
        action_mask = np.asarray([self.state > 0, True, self.state < 10])

        # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask")
        states = dict(state=self.state, action_mask=action_mask)

        return states, terminal, reward


if __name__ == '__main__':
    agent = 'benchmarks/configs/ppo.json'
    runner = Runner(agent=agent,
                    environment=EnvironmentWithMasking,
                    max_episode_timesteps=20)
    runner.run(num_episodes=100)
    runner.close()
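
The listing above starts in the middle of the environment class. A plausible sketch of the parts not shown, with the state and action specifications inferred from the mask logic above:

import numpy as np
from tensorforce import Environment, Runner


class EnvironmentWithMasking(Environment):

    def states(self):
        # Scalar integer state in {0, ..., 10}; the mask item is not part of the specification
        return dict(type=int, num_values=11)

    def actions(self):
        # Single integer action in {0, 1, 2}, interpreted as a step of {-1, 0, +1}
        return dict(type=int, num_values=3)

    def reset(self):
        # Start in the middle of the state range and mask boundary-crossing actions
        self.state = 5
        action_mask = np.asarray([self.state > 0, True, self.state < 10])
        # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask")
        states = dict(state=self.state, action_mask=action_mask)
        return states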
Example No. 25
    def unittest(self,
                 environment=None,
                 states=None,
                 actions=None,
                 num_episodes=None,
                 experience_update=None,
                 **agent):
        """
        Generic unit-test.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            max_episode_timesteps = environment.pop(
                'max_episode_timesteps')  # runner argument

        else:
            max_episode_timesteps = self.__class__.max_episode_timesteps

        agent = self.agent_spec(**agent)

        if num_episodes is None:
            num_updates = 2
        else:
            num_updates = None

        runner = Runner(agent=agent,
                        environment=environment,
                        max_episode_timesteps=max_episode_timesteps)
        runner.run(num_episodes=num_episodes,
                   num_updates=num_updates,
                   use_tqdm=False)
        runner.close()

        # Test experience-update, independent, deterministic
        if experience_update or (experience_update is None
                                 and self.__class__.experience_update):
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)
            agent = Agent.create(agent=agent, environment=environment)
            assert isinstance(agent.model.get_architecture(), str)

            for episode in range(
                    num_updates if num_episodes is None else num_episodes):
                episode_states = list()
                episode_internals = list()
                episode_actions = list()
                episode_terminal = list()
                episode_reward = list()
                states = environment.reset()
                internals = agent.initial_internals()
                terminal = False
                deterministic = True
                while not terminal:
                    episode_states.append(states)
                    episode_internals.append(internals)
                    actions, internals = agent.act(states=states,
                                                   internals=internals,
                                                   independent=True,
                                                   deterministic=deterministic)
                    deterministic = not deterministic
                    episode_actions.append(actions)
                    states, terminal, reward = environment.execute(
                        actions=actions)
                    episode_terminal.append(terminal)
                    episode_reward.append(reward)
                agent.experience(states=episode_states,
                                 internals=episode_internals,
                                 actions=episode_actions,
                                 terminal=episode_terminal,
                                 reward=episode_reward)
                agent.update()

        self.finished_test()
Example No. 26
    def test_load_performance(self):
        self.start_tests(name='load-performance')

        environment = Environment.create(environment='CartPole-v1')

        agent = Agent.load(directory='test/data',
                           filename='ppo-checkpoint',
                           format='checkpoint',
                           environment=environment)
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(
            all(episode_reward == 500.0
                for episode_reward in runner.episode_rewards))
        runner.close()
        agent.close()
        self.finished_test()

        agent = Agent.load(directory='test/data',
                           filename='ppo-checkpoint',
                           format='numpy',
                           environment=environment)
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(
            all(episode_reward == 500.0
                for episode_reward in runner.episode_rewards))
        runner.close()
        agent.close()
        self.finished_test()

        agent = Agent.load(directory='test/data',
                           filename='ppo-checkpoint',
                           format='hdf5',
                           environment=environment)
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(
            all(episode_reward == 500.0
                for episode_reward in runner.episode_rewards))
        runner.close()
        agent.close()
        self.finished_test()

        agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint')

        # 10 episodes
        for _ in range(10):
            states = environment.reset()
            terminal = False
            episode_reward = 0.0
            while not terminal:
                states = np.expand_dims(states, axis=0)
                auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
                actions = agent.act(states, auxiliaries, True)
                actions = actions.numpy().item()
                states, terminal, reward = environment.execute(actions=actions)
                episode_reward += reward
            self.assertEqual(episode_reward, 500.0)

        environment.close()
        self.finished_test()
Example No. 27
    def test_execution(self):
        self.start_tests(name='getting-started-execution')

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=10)
        runner.run(num_episodes=10)
        runner.run(num_episodes=5, evaluation=True)
        runner.close()
        self.finished_test()

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=50,
                        num_parallel=5,
                        remote='multiprocessing')
        runner.run(num_episodes=10)
        runner.close()
        self.finished_test()

        # Create agent and environment
        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=10)
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)

        # Train for 10 episodes (act-observe interface)
        for _ in range(10):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # Train for 10 more episodes (act-experience-update interface)
        for _ in range(10):
            episode_states = list()
            episode_internals = list()
            episode_actions = list()
            episode_terminal = list()
            episode_reward = list()

            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                episode_states.append(states)
                episode_internals.append(internals)
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True)
                episode_actions.append(actions)
                states, terminal, reward = environment.execute(actions=actions)
                episode_terminal.append(terminal)
                episode_reward.append(reward)

            agent.experience(states=episode_states,
                             internals=episode_internals,
                             actions=episode_actions,
                             terminal=episode_terminal,
                             reward=episode_reward)
            agent.update()

        # Evaluate for 10 episodes
        sum_rewards = 0.0
        for _ in range(10):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               deterministic=True,
                                               independent=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward

        print('Mean episode reward:', sum_rewards / 10)

        # Close agent and environment
        agent.close()
        environment.close()

        self.finished_test()
Example No. 28
    def compute(self, config_id, config, budget, working_directory):
        budget = math.log(budget, self.base)
        assert abs(budget - round(budget)) < util.epsilon
        budget = round(budget)
        assert budget < len(self.runs_per_round)
        num_runs = self.runs_per_round[budget]

        update = dict(unit='episodes',
                      batch_size=config['batch_size'],
                      frequency=1)
        policy = dict(network=dict(type='auto', size=64, depth=2, rnn=False))
        optimizer = dict(
            optimizer='adam',
            learning_rate=config['learning_rate'],
            multi_step=config['multi_step'],
            linesearch_iterations=5  # , subsampling_fraction=256
        )

        if config['clipping_value'] > 1.0:
            objective = dict(
                type='policy_gradient',
                importance_sampling=(config['importance_sampling'] == 'yes'))
        else:
            objective = dict(
                type='policy_gradient',
                importance_sampling=(config['importance_sampling'] == 'yes'),
                clipping_value=config['clipping_value'])

        if config['baseline'] == 'no':
            predict_horizon_values = False
            estimate_advantage = False
            predict_action_values = False
            baseline_policy = None
            baseline_optimizer = None
            baseline_objective = None

        elif config['baseline'] == 'same':
            predict_horizon_values = 'early'
            estimate_advantage = (config['estimate_advantage'] == 'yes')
            predict_action_values = False
            baseline_policy = None
            baseline_optimizer = config['baseline_weight']
            baseline_objective = dict(type='value', value='state')

        elif config['baseline'] == 'yes':
            predict_horizon_values = 'early'
            estimate_advantage = (config['estimate_advantage'] == 'yes')
            predict_action_values = False
            baseline_policy = dict(
                network=dict(type='auto', size=64, depth=2, rnn=False))
            baseline_optimizer = config['baseline_weight']
            baseline_objective = dict(type='value', value='state')

        else:
            assert False

        reward_estimation = dict(horizon=config['horizon'],
                                 discount=config['discount'],
                                 predict_horizon_values=predict_horizon_values,
                                 estimate_advantage=estimate_advantage,
                                 predict_action_values=predict_action_values)

        if config['entropy_regularization'] < 1e-5:
            entropy_regularization = 0.0
        else:
            entropy_regularization = config['entropy_regularization']

        agent = dict(policy=policy,
                     memory='recent',
                     update=update,
                     optimizer=optimizer,
                     objective=objective,
                     reward_estimation=reward_estimation,
                     baseline_policy=baseline_policy,
                     baseline_optimizer=baseline_optimizer,
                     baseline_objective=baseline_objective,
                     entropy_regularization=entropy_regularization)

        average_reward = list()
        final_reward = list()
        rewards = list()

        for n in range(num_runs):
            if self.num_parallel is None:
                runner = Runner(
                    agent=agent,
                    environment=self.environment,
                    max_episode_timesteps=self.max_episode_timesteps)
                runner.run(num_episodes=self.num_episodes, use_tqdm=False)
            else:
                runner = Runner(
                    agent=agent,
                    environment=self.environment,
                    max_episode_timesteps=self.max_episode_timesteps,
                    num_parallel=min(self.num_parallel, config['batch_size']),
                    remote='multiprocessing')
                runner.run(num_episodes=self.num_episodes,
                           batch_agent_calls=True,
                           sync_episodes=True,
                           use_tqdm=False)
            runner.close()

            average_reward.append(
                float(np.mean(runner.episode_rewards, axis=0)))
            final_reward.append(
                float(np.mean(runner.episode_rewards[-20:], axis=0)))
            rewards.append(list(runner.episode_rewards))

        mean_average_reward = float(np.mean(average_reward, axis=0))
        mean_final_reward = float(np.mean(final_reward, axis=0))
        loss = -(mean_average_reward + mean_final_reward)

        return dict(loss=loss, info=dict(rewards=rewards))
Example No. 29
    def test_remote_environments(self):
        self.start_tests(name='remote-environments')

        agent = self.agent_spec(require_observe=True,
                                update=dict(unit='episodes', batch_size=1),
                                parallel_interactions=2)
        environment = self.environment_spec()

        runner = Runner(agent=agent,
                        environment=environment,
                        num_parallel=2,
                        remote='multiprocessing')
        runner.run(num_episodes=self.__class__.num_episodes, use_tqdm=False)
        runner.close()
        self.finished_test()

        def server(port):
            Environment.create(environment=environment,
                               remote='socket-server',
                               port=port)

        server1 = Thread(target=server, kwargs=dict(port=65432))
        server2 = Thread(target=server, kwargs=dict(port=65433))
        server1.start()
        server2.start()
        runner = Runner(agent=agent,
                        num_parallel=2,
                        remote='socket-client',
                        host='127.0.0.1',
                        port=65432)
        runner.run(num_episodes=self.__class__.num_episodes, use_tqdm=False)
        runner.close()
        server1.join()
        server2.join()

        self.finished_test()
Example No. 30
os.remove('test/data/ppo-checkpoint-1.data-00000-of-00001')
os.remove('test/data/ppo-checkpoint-1.index')
os.remove('test/data/ppo-checkpoint.json')
os.remove('test/data/ppo-checkpoint.npz')
os.remove('test/data/ppo-checkpoint.hdf5')

os.rmdir('test/data/ppo-checkpoint/assets')
os.remove('test/data/ppo-checkpoint/variables/variables.data-00000-of-00001')
os.remove('test/data/ppo-checkpoint/variables/variables.index')
os.rmdir('test/data/ppo-checkpoint/variables')
os.remove('test/data/ppo-checkpoint/saved_model.pb')
os.rmdir('test/data/ppo-checkpoint')

runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                           config=dict(device='CPU'),
                           recorder=dict(directory='test/data/ppo-traces',
                                         start=80)),
                environment='benchmarks/configs/cartpole.json')
runner.run(num_episodes=100)
runner.agent.save(directory='test/data',
                  filename='ppo-checkpoint',
                  format='checkpoint')
runner.agent.save(directory='test/data',
                  filename='ppo-checkpoint',
                  format='numpy')
runner.agent.save(directory='test/data',
                  filename='ppo-checkpoint',
                  format='hdf5')
runner.agent.save(directory='test/data',
                  filename='ppo-checkpoint',
                  format='saved-model')
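
The checkpoint, NumPy, HDF5 and SavedModel files regenerated above are the ones consumed by test_load_performance in Example No. 26; a minimal sanity check reusing only calls shown there:

environment = Environment.create(environment='benchmarks/configs/cartpole.json')
agent = Agent.load(directory='test/data', filename='ppo-checkpoint',
                   format='checkpoint', environment=environment)
runner = Runner(agent=agent, environment=environment)
runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
runner.close()
agent.close()
environment.close()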