Example #1
    def test_environment(self):
        self.start_tests(name='getting-started-environment')

        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=50)
        self.finished_test()

        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')
        self.finished_test()

        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=50)
        self.finished_test()

        environment = Environment.create(
            environment='test.data.custom_env.CustomEnvironment',
            max_episode_timesteps=10)
        self.finished_test()

        from test.data.custom_env import CustomEnvironment
        environment = Environment.create(environment=CustomEnvironment,
                                         max_episode_timesteps=10)
        self.finished_test()
Example #2
 def server(port):
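     # Serve the unit-test environment over a socket on the given port (remote='socket-server'),
     # so that a client process (e.g. a Runner created with remote='socket-client') can drive it;
     # `environment` and the `self.__class__` attributes come from the enclosing test class.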
     Environment.create(environment=environment,
                        max_episode_timesteps=5,
                        remote='socket-server',
                        port=port,
                        states=self.__class__.states,
                        actions=self.__class__.actions,
                        min_timesteps=self.__class__.min_timesteps)
Example #3
    def prepare(
            self,
            # general environment
            environment=None,
            max_episode_timesteps=None,
            # unit-test environment
            min_timesteps=None,
            states=None,
            actions=None,
            # exclude action types
            exclude_bool_action=False,
            exclude_int_action=False,
            exclude_float_action=False,
            exclude_bounded_action=False,
            # agent
            require_observe=False,
            require_all=False,
            **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            environment = self.environment_spec(
                max_episode_timesteps=max_episode_timesteps,
                min_timesteps=min_timesteps,
                states=states,
                actions=actions,
                exclude_bool_action=exclude_bool_action,
                exclude_int_action=exclude_int_action,
                exclude_float_action=exclude_float_action,
                exclude_bounded_action=exclude_bounded_action)
            environment = Environment.create(environment=environment)

        elif min_timesteps is None:
            if max_episode_timesteps is None:
                max_episode_timesteps = self.__class__.max_episode_timesteps

            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)

        else:
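            # An explicit environment cannot be combined with the unit-test min_timesteps argument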
            raise TensorforceError.unexpected()

        agent = self.agent_spec(require_observe=require_observe,
                                require_all=require_all,
                                **agent)

        agent = Agent.create(agent=agent, environment=environment)

        return agent, environment
Example #4
 def setup(self, dbars: Any) -> Any:
     trainingEnvironment = Environment.create(
         environment=TradingEnvironment(dbars),
     )
     self.agent = Agent.create(
         agent=PPOAgent,
         environment=trainingEnvironment,  # alternatively: states, actions, (max_episode_timesteps)
         update=dict(
             unit='timesteps', 
             batch_size=64
         ),
         network="auto",
         ## exploration=?,
         reward_estimation=dict(
             horizon=20
             # discount=?,
         ),
         learning_rate=3e-4,
         # likelihood_ratio_clipping=?,
         # subsampling_fraction=?,
         # multi_step=?
         summarizer=dict(
             directory='./tensorboard/'
         )
     )
     self.agent.save(directory='model-numpy', format='checkpoint', append='episodes')
     ## Train!
     runner = Runner(self.agent, environment=trainingEnvironment)
     runner.run(
         num_episodes=10000,
         save_best_agent='./best-agent/'
     )
     trainingEnvironment.close()
     ## Prepare agent for trading
     self.internal_state = self.agent.initial_internals()
Example #5
    def test_agent(self):
        self.start_tests(name='getting-started-agent')

        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=50)
        self.finished_test()

        agent = Agent.create(agent='tensorforce',
                             environment=environment,
                             update=64,
                             optimizer=dict(optimizer='adam',
                                            learning_rate=1e-3),
                             objective='policy_gradient',
                             reward_estimation=dict(horizon=20))
        self.finished_test()

        agent = Agent.create(agent='ppo',
                             environment=environment,
                             batch_size=10,
                             learning_rate=1e-3)
        self.finished_test()

        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)
        self.finished_test()
Example #6
def main():
    # Record experience traces
    record_ppo_config(directory='ppo-traces')
    # Alternatively:
    # record_custom_act_function(directory='ppo-traces')
    # write_custom_recording_file(directory='ppo-traces')

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    agent.pretrain(directory='ppo-traces',
                   num_iterations=30,
                   num_traces=1,
                   num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
Example #7
def main():
    # Start recording traces after the first 80 episodes -- by then, the agent
    # has solved the environment
    runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                               recorder=dict(directory='ppo-traces',
                                             start=80)),
                    environment='benchmarks/configs/cartpole.json')
    runner.run(num_episodes=100)
    runner.close()

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    agent.pretrain(directory='ppo-traces',
                   num_iterations=30,
                   num_traces=1,
                   num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
Example #8
def get_agent_and_runner(max_timesteps=EPISODE_MAX_LENGTH):
    max_timesteps = EPISODE_MAX_LENGTH if max_timesteps is None else max_timesteps
    # OpenAI-Gym environment specification
    gym_environment = gym.make(LEVEL, render=True)
    gym_environment = TimeLimit(gym_environment.unwrapped,
                                max_episode_steps=max_timesteps)
    # gym_environment = Monitor(gym_environment, RECORD_DICT, force=True)

    environment = Environment.create(
        environment=gym_environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
    )

    agent = Agent.create(
        agent='a2c',
        environment=environment,
        # parallel_interactions=PARALLEL,
        # Automatically configured network
        # network='auto',
        network=[
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
        ],
        # AC optimization parameters
        batch_size=256,
        update_frequency=2,
        learning_rate=0.001,
        # Reward estimation
        discount=0.99,
        predict_terminal_values=False,
        # Regularization
        l2_regularization=1.0,
        entropy_regularization=0.0,
        # Preprocessing
        state_preprocessing='linear_normalization',
        reward_preprocessing=None,
        # Exploration
        exploration=0.3,
        variable_noise=0.2,
        # Default additional config values
        config=None,
        # Save agent every 10 updates and keep the 5 most recent checkpoints
        saver=dict(directory=MODEL_DICT, frequency=10, max_checkpoints=5),
        # Log all available Tensorboard summaries
        summarizer=dict(directory=SUMMARY_DICT, summaries='all'),
        # Do not record agent-environment interaction trace
        recorder=None  # RECORD_DICT
    )

    # Initialize the runner
    runner = Runner(
        agent=agent,
        environment=environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
        # num_parallel=PARALLEL,
        # remote="multiprocessing"
    )

    return agent, runner
Example #9
    def test_quickstart(self):
        self.start_tests(name='quickstart')

        # ====================

        # Create an OpenAI-Gym environment
        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')

        # Create a PPO agent
        agent = Agent.create(
            agent='ppo',
            environment=environment,
            # Automatically configured network
            network='auto',
            # Optimization
            batch_size=10,
            update_frequency=2,
            learning_rate=1e-3,
            subsampling_fraction=0.2,
            optimization_steps=5,
            # Reward estimation
            likelihood_ratio_clipping=0.2,
            discount=0.99,
            estimate_terminal=False,
            # Critic
            critic_network='auto',
            critic_optimizer=dict(optimizer='adam',
                                  multi_step=10,
                                  learning_rate=1e-3),
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Start the runner
        runner.run(num_episodes=50)
        runner.close()

        # ====================

        self.finished_test()
Example #10
    def prepare(self, environment=None, states=None, actions=None, **agent):
        """
        Generic unit-test preparation.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            environment = Environment.create(environment=environment)

        else:
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=self.__class__.max_episode_timesteps)

        agent = self.agent_spec(**agent)

        agent = Agent.create(agent=agent, environment=environment)

        return agent, environment
Example #11
    def test_load_performance(self):
        self.start_tests(name='load-performance')

        environment = Environment.create(environment='CartPole-v1')

        agent = dict(directory='test/data',
                     filename='ppo-checkpoint',
                     format='checkpoint')
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(
            all(episode_return == 500.0
                for episode_return in runner.episode_returns))
        runner.close()
        self.finished_test()

        agent = dict(directory='test/data',
                     filename='ppo-checkpoint',
                     format='numpy')
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(
            all(episode_return == 500.0
                for episode_return in runner.episode_returns))
        runner.close()
        self.finished_test()

        agent = dict(directory='test/data',
                     filename='ppo-checkpoint',
                     format='hdf5')
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(
            all(episode_return == 500.0
                for episode_return in runner.episode_returns))
        runner.close()
        self.finished_test()

        agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint')

        # 10 episodes
        for _ in range(10):
            states = environment.reset()
            terminal = False
            episode_return = 0.0
            while not terminal:
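                # The exported SavedModel acts on batched inputs and, for CartPole's two discrete
                # actions, additionally expects a boolean action mask of shape (batch, 2)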
                states = np.expand_dims(states, axis=0)
                auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
                actions = agent.act(states, auxiliaries, True)
                actions = actions.numpy().item()
                states, terminal, reward = environment.execute(actions=actions)
                episode_return += reward
            self.assertEqual(episode_return, 500.0)

        environment.close()
        self.finished_test()
Example #12
    def prepare(self, environment=None, states=None, actions=None, **agent):
        """
        Generic unit-test preparation.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            environment = Environment.create(environment=environment)

        else:
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=self.__class__.max_episode_timesteps)

        agent = self.agent_spec(**agent)

        agent = Agent.create(agent=agent, environment=environment)
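        # Sanity check: apart from the constant and random agents, every agent should report
        # its network architecture as a string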
        assert agent.__class__.__name__ in ('ConstantAgent', 'RandomAgent') or \
            isinstance(agent.model.get_architecture(), str)

        return agent, environment
Example #13
def main():
    num_parallel = 8
    environment = Environment.create(environment='custom_cartpole',
                                     max_episode_timesteps=500)
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment,
                         parallel_interactions=num_parallel)

    # Train for 100 episodes
    for episode in range(0, 100, num_parallel):

        # Episode using act and observe
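        # reset(num_parallel=...) returns the indices of the active parallel environments
        # together with the correspondingly batched states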
        parallel, states = environment.reset(num_parallel=num_parallel)
        terminal = (parallel < 0)  # all false
        sum_rewards = 0.0
        num_updates = 0
        while not terminal.all():
            actions = agent.act(states=states, parallel=parallel)
            next_parallel, states, terminal, reward = environment.execute(
                actions=actions)
            num_updates += agent.observe(terminal=terminal,
                                         reward=reward,
                                         parallel=parallel)
            parallel = next_parallel
            sum_rewards += reward.sum()
        print('Episode {}: return={} updates={}'.format(
            episode, sum_rewards / num_parallel, num_updates))

    # Evaluate for 100 episodes
    num_parallel = 4
    num_episodes = 100
    sum_rewards = 0.0
    for _ in range(0, num_episodes, num_parallel):
        parallel, states = environment.reset(num_parallel=num_parallel)
        internals = agent.initial_internals()
        internals = [internals for _ in range(num_parallel)]
        terminal = (parallel < 0)  # all false
        while not terminal.all():
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           independent=True,
                                           deterministic=True)
            _, states, terminal, reward = environment.execute(actions=actions)
            internals = [
                internal for internal, term in zip(internals, terminal)
                if not term
            ]
            sum_rewards += reward.sum()
    print('Mean evaluation return:', sum_rewards / num_episodes)

    # Close agent and environment
    agent.close()
    environment.close()
Example #14
def main():
    # Train agent
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    runner = Runner(agent='benchmarks/configs/ppo.json',
                    environment=environment)
    runner.run(num_episodes=100)

    # Save agent SavedModel
    runner.agent.save(directory='saved-model', format='saved-model')
    runner.close()

    # Model serving, potentially using different programming language etc
    # (For regular model saving and loading within Python, see save_load_agent.py example)

    # Load agent SavedModel
    agent = tf.saved_model.load(export_dir='saved-model')

    # Evaluate for 100 episodes
    sum_rewards = 0.0
    for _ in range(100):
        states = environment.reset()

        # Required in case of internal states:
        # internals = agent.initial_internals()
        # internals = recursive_map(batch, internals)

        terminal = False
        while not terminal:

            states = batch(states)
            # Required in case of nested states:
            # states = recursive_map(batch, states)

            auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
            deterministic = True

            actions = agent.act(states, auxiliaries, deterministic)
            # Required in case of internal states:
            # actions_internals = agent.act(states, internals, auxiliaries, deterministic)
            # actions, internals = actions_internals['actions'], actions_internals['internals']

            actions = unbatch(actions)
            # Required in case of nested actions:
            # actions = recursive_map(unbatch, actions)

            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    print('Mean evaluation return:', sum_rewards / 100.0)
    environment.close()
Example #15
    def test_record_and_pretrain(self):
        self.start_tests(name='record-and-pretrain')

        with TemporaryDirectory() as directory:

            # ====================

            # Start recording traces after the first 8 episodes (run length reduced for testing)
            runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                                       recorder=dict(directory=directory,
                                                     start=8)),
                            environment='benchmarks/configs/cartpole.json')
            runner.run(num_episodes=10)
            runner.close()

            # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
            # experience of one episode to the agent and subsequently perform one update
            environment = Environment.create(
                environment='benchmarks/configs/cartpole.json')
            agent = Agent.create(agent='benchmarks/configs/ppo.json',
                                 environment=environment)
            agent.pretrain(directory='test/data/ppo-traces',
                           num_iterations=30,
                           num_traces=1,
                           num_updates=1)

            # Evaluate the pretrained agent
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=10, evaluation=True)
            self.assertTrue(
                all(episode_reward == 500.0
                    for episode_reward in runner.episode_rewards))
            runner.close()

            # Close agent and environment
            agent.close()
            environment.close()

            # ====================

            files = sorted(os.listdir(path=directory))
            self.assertEqual(len(files), 2)
            self.assertTrue(
                all(
                    file.startswith('trace-')
                    and file.endswith('0000000{}.npz'.format(n))
                    for n, file in enumerate(files, start=8)))

        self.finished_test()
Example #16
    def test_act_observe(self):
        self.start_tests(name='act-observe')

        # ====================

        environment = Environment.create(
            environment='benchmarks/configs/cartpole.json')
        agent = Agent.create(agent='benchmarks/configs/ppo.json',
                             environment=environment)

        # Train for 10 episodes
        for episode in range(10):

            # Episode using act and observe
            states = environment.reset()
            terminal = False
            sum_reward = 0.0
            num_updates = 0
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                num_updates += agent.observe(terminal=terminal, reward=reward)
                sum_reward += reward
            print('Episode {}: return={} updates={}'.format(
                episode, sum_reward, num_updates))

        # Evaluate for 10 episodes
        sum_rewards = 0.0
        for _ in range(10):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True,
                                               deterministic=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward
        print('Mean evaluation return:', sum_rewards / 10.0)

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        self.finished_test()
Example #17
    def test_execution(self):
        self.start_tests(name='getting-started-execution')

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=10)
        runner.run(num_episodes=10)
        runner.run(num_episodes=5, evaluation=True)
        runner.close()
        self.finished_test()

        # Create agent and environment
        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=10)
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)

        # Train for 10 episodes
        for _ in range(10):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # Evaluate for 5 episodes
        sum_rewards = 0.0
        for _ in range(5):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               evaluation=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward

        print('Mean evaluation return:', sum_rewards / 5)

        # Close agent and environment
        agent.close()
        environment.close()

        self.finished_test()
Example #18
    def test_readme(self):
        self.start_tests(name='readme')

        # ====================

        from tensorforce import Agent, Environment

        # Pre-defined or custom environment
        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=500)

        # Instantiate a Tensorforce agent
        agent = Agent.create(
            agent='tensorforce',
            environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
            memory=1000,
            update=dict(unit='timesteps', batch_size=64),
            optimizer=dict(type='adam', learning_rate=3e-4),
            policy=dict(network='auto'),
            objective='policy_gradient',
            reward_estimation=dict(horizon=20))

        # Train for a single episode
        for _ in range(1):

            # Initialize episode
            states = environment.reset()
            terminal = False

            while not terminal:
                # Episode timestep
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        agent.close()
        environment.close()

        # ====================

        self.finished_test()
Example #19
def main():
    # OpenAI-Gym environment initialization
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')

    # PPO agent initialization
    agent = Agent.create(
        agent='benchmarks/configs/ppo.json', environment=environment,
        # Option 1: Saver - save agent periodically every 10 updates
        # and keep the 5 most recent checkpoints
        saver=dict(directory='model-checkpoint', frequency=10, max_checkpoints=5),
    )

    # Runner initialization
    runner = Runner(agent=agent, environment=environment)

    # Training
    runner.run(num_episodes=100)
    runner.close()

    # Option 2: Explicit save
    # (format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores full TensorFlow model,
    # agent argument saver, specified above, uses 'checkpoint')
    agent.save(directory='model-numpy', format='numpy', append='episodes')

    # Close agent separately, since created separately
    agent.close()

    # Load agent TensorFlow checkpoint
    agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Load agent NumPy weights
    agent = Agent.load(directory='model-numpy', format='numpy', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Close environment separately, since created separately
    environment.close()
Example #20
    def initialize_agent(self):
        # Set up information about the boost pads now that the game is active and the info is available
        self.boost_pad_tracker.initialize_boosts(self.get_field_info())
        if MODEL is not None:
            max_time = 10
            frames_per_sec = 20
            max_timesteps = RLEnvironment.get_max_timesteps(max_time, frames_per_sec)
            self.env = Environment.create(
                environment=KickoffEnvironment,
                max_episode_timesteps=max_timesteps,
                max_time=max_time,
                message_throttle=20,
                frames_per_sec=frames_per_sec,
                input_exclude=[
                    InputOptions.BALL_POSITION_REL,
                    InputOptions.BALL_DIRECTION,
                    InputOptions.CAR_POSITION_REL,
                    InputOptions.CAR_VELOCITY_MAG,
                ],
                output_exclude=[
                    OutputOptions.BOOST,
                    OutputOptions.STEER,
                    OutputOptions.E_BRAKE,
                    OutputOptions.THROTTLE,
                    OutputOptions.ROLL,
                ]
            )

            directory = '../learning/training/{0}'.format(MODEL)
            filename = 'agent'
            agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')

            if not os.path.isfile(agent):
                logging_utils.log_warn(os.getcwd(), {})
                raise Exception('Model file doesn\'t exist')
            
            self.agent = Agent.load(
                directory=os.path.abspath(directory),
                environment=self.env,
                format='checkpoint',
            )
            self.env.reset()
Example #21
def write_custom_recording_file(directory):
    # Start recording traces after 80 episodes -- by then, the environment is solved
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=80)
    runner.close()

    # Record 20 episodes
    for episode in range(20):

        # Record episode experience
        episode_states = list()
        episode_actions = list()
        episode_terminal = list()
        episode_reward = list()

        # Evaluation episode
        states = environment.reset()
        terminal = False
        while not terminal:
            episode_states.append(states)
            actions = agent.act(states=states,
                                independent=True,
                                deterministic=True)
            episode_actions.append(actions)
            states, terminal, reward = environment.execute(actions=actions)
            episode_terminal.append(terminal)
            episode_reward.append(reward)

        # Write recorded episode trace to npz file
        np.savez_compressed(file=os.path.join(
            directory, 'trace-{:09d}.npz'.format(episode)),
                            states=np.stack(episode_states, axis=0),
                            actions=np.stack(episode_actions, axis=0),
                            terminal=np.stack(episode_terminal, axis=0),
                            reward=np.stack(episode_reward, axis=0))
Example #22
    def test_quickstart(self):
        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=500)

        agent = Agent.create(
            agent='tensorforce',
            environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
            memory=1000,
            update=dict(unit='timesteps', batch_size=32),
            optimizer=dict(type='adam', learning_rate=3e-4),
            policy=dict(network='auto'),
            objective='policy_gradient',
            reward_estimation=dict(horizon=1))

        # Single interaction step: act, execute, and check the resulting transition.
        states = environment.reset()
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)

        self.assertEqual(4, len(states))
        self.assertFalse(terminal)
        self.assertEqual(1, reward)
Example #23

    # download latest version of Oxford dataset
    download_csv(LATEST_DATA_URL, "OxCGRT_latest", dest_folder=data_folder)

# number of prescriptions (1 prescription per day)
future_days = 3

# Path to the "standard_predictor/predict.py" from the covid-xprize repo
# the covid-xprize package needs to be installed "pip install -e."
predictor_script_path = "/Users/romainegele/Documents/xPrize/covid-xprize/covid_xprize/standard_predictor/predict.py"

# Instanciate environment and wrap it up in Tensorforce.Environment class
env = Environment.create(CovidEnv(future_days, predictor_script_path,
                                  OXFORD_CSV_PATH),
                         max_episode_timesteps=future_days)

print("ACTION SPACE")
pprint(env.actions())

print("STATE SPACE")
pprint(env.states())

# Create Agent
agent = Agent.create(agent='ppo',
                     environment=env,
                     batch_size=10,
                     learning_rate=1e-3)

# Create a runner
Example #24
from tensorforce import Agent, Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import pickle
from tqdm import tqdm
episode_number = 400
average_over = 20
# Pre-defined or custom environment
environment = Environment.create(environment='gym',
                                 level='CartPole-v1',
                                 max_episode_timesteps=1000)
'''
    Actions:
        Type: Discrete(2)
        Num   Action
        0     Push cart to the left
        1     Push cart to the right
    Observation:
        Type: Box(4)
        Num     Observation               Min                     Max
        0       Cart Position             -4.8                    4.8
        1       Cart Velocity             -Inf                    Inf
        2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
        3       Pole Angular Velocity     -Inf                    Inf

    Terminal State:
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4
'''
# Initialize reward record and set parameters
Example #25
# Set parameters
num_steps = 1000  # update exploration rate over n steps
initial_value = 0.9  # initial exploration rate
decay_rate = 0.5  # exploration rate decay rate
set_type = 'exponential'  # type of decay: 'linear' or 'exponential'
exploration = dict(type=set_type, unit='timesteps',
                   num_steps=num_steps, initial_value=initial_value,
                   decay_rate=decay_rate)

episode_number=10000
evaluation_episode_number=50
average_over=100

# Pre-defined or custom environment
environment = Environment.create(environment='gym', level='Walker2d-v3')
'''
For detailed notes on how to interact with the Mujoco environment, please refer
to note https://bailiping.github.io/Mujoco/

Observation:
    def _get_obs(self):
        qpos = self.sim.data.qpos
        qvel = self.sim.data.qvel
        return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel()

    Num    Observation                                 Min            Max
           rootx(_get_obs states from  root z)          Not Limited
    0      rootz                                        Not Limited
    1      rooty                                        Not Limited
    2      thigh joint                                 -150           0
Example #26
def main():
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)

    # Train for 100 episodes
    for episode in range(100):

        # Record episode experience
        episode_states = list()
        episode_internals = list()
        episode_actions = list()
        episode_terminal = list()
        episode_reward = list()

        # Episode using independent act and agent.initial_internals()
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        sum_reward = 0.0
        while not terminal:
            episode_states.append(states)
            episode_internals.append(internals)
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           independent=True)
            episode_actions.append(actions)
            states, terminal, reward = environment.execute(actions=actions)
            episode_terminal.append(terminal)
            episode_reward.append(reward)
            sum_reward += reward
        print('Episode {}: {}'.format(episode, sum_reward))

        # Feed recorded experience to agent
        agent.experience(states=episode_states,
                         internals=episode_internals,
                         actions=episode_actions,
                         terminal=episode_terminal,
                         reward=episode_reward)

        # Perform update
        agent.update()

    # Evaluate for 100 episodes
    sum_rewards = 0.0
    for _ in range(100):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           independent=True,
                                           deterministic=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    print('Mean evaluation return:', sum_rewards / 100.0)

    # Close agent and environment
    agent.close()
    environment.close()
Example #27
# Set parameters
num_steps = 500  # update exploration rate over n steps
initial_value = 0.95  # initial exploration rate
decay_rate = 0.5  # exploration rate decay rate
set_type = 'exponential'  # type of decay: 'linear' or 'exponential'
exploration = dict(type=set_type,
                   unit='timesteps',
                   num_steps=num_steps,
                   initial_value=initial_value,
                   decay_rate=decay_rate)

episode_number = 5000
evaluation_episode_number = 5

# Pre-defined or custom environment
environment = Environment.create(environment='gym', level='Hopper-v3')

length = np.zeros(episode_number)

reward_record_without = []

agent_without = Agent.create(agent='agent.json',
                             environment=environment,
                             exploration=exploration)
states = environment.reset()
terminal = False
print('training agent without boundary')
angle_record = []
for _ in tqdm(range(episode_number)):
    episode_reward = 0
    states = environment.reset()
Example #28
def main():
    parser = argparse.ArgumentParser(description='Tensorforce runner')
    # Agent arguments
    parser.add_argument(
        '-a',
        '--agent',
        type=str,
        default=None,
        help='Agent (name, configuration JSON file, or library module)')
    parser.add_argument('-c',
                        '--checkpoints',
                        type=str,
                        default=None,
                        help='TensorFlow checkpoints directory')
    parser.add_argument('-s',
                        '--summaries',
                        type=str,
                        default=None,
                        help='TensorBoard summaries directory')
    parser.add_argument('--recordings',
                        type=str,
                        default=None,
                        help='Traces recordings directory')
    # Environment arguments
    parser.add_argument(
        '-e',
        '--environment',
        type=str,
        default=None,
        help='Environment (name, configuration JSON file, or library module)')
    parser.add_argument(
        '-l',
        '--level',
        type=str,
        default=None,
        help='Level or game id, like `CartPole-v1`, if supported')
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help='Maximum number of timesteps per episode')
    parser.add_argument(
        '--visualize',
        action='store_true',
        help='Visualize agent--environment interaction, if supported')
    parser.add_argument(
        '--visualize-directory',
        type=str,
        default=None,
        help=
        'Directory to store videos of agent--environment interaction, if supported'
    )
    parser.add_argument(
        '--import-modules',
        type=str,
        default=None,
        help='Import comma-separated modules required for environment')
    # Parallel execution arguments
    parser.add_argument(
        '--num-parallel',
        type=int,
        default=None,
        help='Number of environment instances to execute in parallel')
    parser.add_argument(
        '--batch-agent-calls',
        action='store_true',
        help='Batch agent calls for parallel environment execution')
    parser.add_argument(
        '--sync-timesteps',
        action='store_true',
        help='Synchronize parallel environment execution on timestep-level')
    parser.add_argument(
        '--sync-episodes',
        action='store_true',
        help='Synchronize parallel environment execution on episode-level')
    parser.add_argument(
        '--remote',
        type=str,
        choices=('multiprocessing', 'socket-client', 'socket-server'),
        default=None,
        help=
        'Communication mode for remote environment execution of parallelized '
        'environment execution')
    parser.add_argument('--blocking',
                        action='store_true',
                        help='Remote environments should be blocking')
    parser.add_argument(
        '--host',
        type=str,
        default=None,
        help=
        'Socket server hostname(s) or IP address(es), single value or comma-separated list'
    )
    parser.add_argument(
        '--port',
        type=str,
        default=None,
        help=
        'Socket server port(s), single value or comma-separated list, increasing sequence if '
        'single host and port given')
    # Runner arguments
    parser.add_argument(
        '-v',
        '--evaluation',
        action='store_true',
        help='Run environment (last if multiple) in evaluation mode')
    parser.add_argument('-n',
                        '--episodes',
                        type=int,
                        default=None,
                        help='Number of episodes')
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help='Number of timesteps')
    parser.add_argument('-u',
                        '--updates',
                        type=int,
                        default=None,
                        help='Number of agent updates')
    parser.add_argument(
        '--mean-horizon',
        type=int,
        default=1,
        help=
        'Number of episodes progress bar values and evaluation score are averaged over'
    )
    parser.add_argument(
        '--save-best-agent',
        type=str,
        default=None,
        help=
        'Directory to save the best version of the agent according to the evaluation score'
    )
    # Logging arguments
    parser.add_argument('-r',
                        '--repeat',
                        type=int,
                        default=1,
                        help='Number of repetitions')
    parser.add_argument(
        '--path',
        type=str,
        default=None,
        help='Logging path, directory plus filename without extension')
    parser.add_argument('--seaborn', action='store_true', help='Use seaborn')
    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    if args.path is None:
        callback = None

    else:
        assert os.path.splitext(args.path)[1] == ''
        assert args.episodes is not None and args.visualize is not None
        rewards = [list() for _ in range(args.episodes)]
        timesteps = [list() for _ in range(args.episodes)]
        seconds = [list() for _ in range(args.episodes)]
        agent_seconds = [list() for _ in range(args.episodes)]

        def callback(r, p):
            rewards[r.episodes - 1].append(float(r.episode_rewards[-1]))
            timesteps[r.episodes - 1].append(int(r.episode_timesteps[-1]))
            seconds[r.episodes - 1].append(float(r.episode_seconds[-1]))
            agent_seconds[r.episodes - 1].append(
                float(r.episode_agent_seconds[-1]))
            return True

    if args.environment is None:
        environment = None
    else:
        environment = dict(environment=args.environment)
    if args.level is not None:
        environment['level'] = args.level
    if args.visualize:
        environment['visualize'] = True
    if args.visualize_directory is not None:
        environment['visualize_directory'] = args.visualize_directory

    if args.host is not None and ',' in args.host:
        args.host = args.host.split(',')
    if args.port is not None and ',' in args.port:
        args.port = [int(x) for x in args.port.split(',')]
    elif args.port is not None:
        args.port = int(args.port)

    if args.remote == 'socket-server':
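        # In socket-server mode only the environment is served here; the agent and runner
        # are expected to run in the connecting client process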
        Environment.create(environment=environment,
                           max_episode_timesteps=args.max_episode_timesteps,
                           remote=args.remote,
                           port=args.port)
        return

    if args.agent is None:
        assert args.checkpoints is None and args.summaries is None and args.recordings is None
        agent = None
    else:
        agent = dict(agent=args.agent)
        if args.checkpoints is not None:
            assert 'saver' not in agent
            agent['saver'] = args.checkpoints
        if args.summaries is not None:
            assert 'summarizer' not in agent
            agent['summarizer'] = args.summaries
        if args.recordings is not None:
            assert 'recorder' not in agent
            agent['recorder'] = args.recordings

    for _ in range(args.repeat):
        runner = Runner(agent=agent,
                        environment=environment,
                        max_episode_timesteps=args.max_episode_timesteps,
                        evaluation=args.evaluation,
                        num_parallel=args.num_parallel,
                        remote=args.remote,
                        blocking=args.blocking,
                        host=args.host,
                        port=args.port)
        runner.run(num_episodes=args.episodes,
                   num_timesteps=args.timesteps,
                   num_updates=args.updates,
                   batch_agent_calls=args.batch_agent_calls,
                   sync_timesteps=args.sync_timesteps,
                   sync_episodes=args.sync_episodes,
                   callback=callback,
                   mean_horizon=args.mean_horizon,
                   save_best_agent=args.save_best_agent)
        runner.close()

    if args.path is not None:
        directory = os.path.split(args.path)[0]
        if directory != '' and not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)

        with open(args.path + '.json', 'w') as filehandle:
            filehandle.write(
                json.dumps(
                    dict(rewards=rewards,
                         timesteps=timesteps,
                         seconds=seconds,
                         agent_seconds=agent_seconds)))

        if args.seaborn:
            import seaborn as sns
            sns.set()

        xs = np.arange(len(rewards))
        min_rewards = np.amin(rewards, axis=1)
        max_rewards = np.amax(rewards, axis=1)
        median_rewards = np.median(rewards, axis=1)
        plt.plot(xs, median_rewards, color='green', linewidth=2.0)
        plt.fill_between(xs,
                         min_rewards,
                         max_rewards,
                         color='green',
                         alpha=0.4)
        plt.xlabel('episodes')
        plt.ylabel('reward')
        plt.savefig(fname=(args.path + '.png'))
Example #29
    def __init__(
        self, agent, environment=None, max_episode_timesteps=None, num_parallel=None,
        environments=None, evaluation=False, remote=None, blocking=False, host=None, port=None
    ):
        if environment is None and environments is None:
            if remote != 'socket-client':
                raise TensorforceError.required(
                    name='Runner', argument='environment or environments'
                )
            if num_parallel is None:
                raise TensorforceError.required(
                    name='Runner', argument='num_parallel', condition='socket-client remote mode'
                )
            environments = [None for _ in range(num_parallel)]

        elif environment is None:
            if environments is None:
                raise TensorforceError.required(
                    name='Runner', argument='environment or environments'
                )
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(
                    name='Runner', argument='environments', value=environments
                )
            if len(environments) <= 1:
                raise TensorforceError.value(
                    name='Runner', argument='len(environments)', value=len(environments)
                )
            if num_parallel is not None and num_parallel != len(environments):
                raise TensorforceError.value(
                    name='Runner', argument='num_parallel', value=num_parallel,
                    hint='!= len(environments)'
                )
            num_parallel = len(environments)
            environments = list(environments)

        elif num_parallel is None:
            if environments is not None:
                raise TensorforceError.invalid(
                    name='Runner', argument='environments', condition='environment is specified'
                )
            if evaluation:
                raise TensorforceError.invalid(
                    name='Runner', argument='evaluation', condition='single environment'
                )
            num_parallel = 1
            environments = [environment]

        else:
            if not isinstance(num_parallel, int):
                raise TensorforceError.value(
                    name='Runner', argument='num_parallel', dtype=type(num_parallel)
                )
            elif num_parallel < 2:
                raise TensorforceError.value(
                    name='Runner', argument='num_parallel', value=num_parallel, hint='< 2'
                )
            if environments is not None:
                raise TensorforceError.invalid(
                    name='Runner', argument='environments', condition='environment is specified'
                )
            if isinstance(environment, Environment):
                raise TensorforceError.type(
                    name='Runner', argument='environment', dtype=type(environment),
                    condition='num_parallel', hint='is not specification'
                )
            environments = [environment for _ in range(num_parallel)]

        if port is None or isinstance(port, int):
            if isinstance(host, str):
                port = [port + n for n in range(num_parallel)]
            else:
                port = [port for _ in range(num_parallel)]
        else:
            if len(port) != num_parallel:
                raise TensorforceError.value(
                    name='Runner', argument='len(port)', value=len(port), hint='!= num_parallel'
                )
        if host is None or isinstance(host, str):
            host = [host for _ in range(num_parallel)]
        else:
            if len(host) != num_parallel:
                raise TensorforceError.value(
                    name='Runner', argument='len(host)', value=len(host), hint='!= num_parallel'
                )

        self.environments = list()
        self.is_environment_external = isinstance(environments[0], Environment)
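        # Create the first (possibly remote) environment; its state/action spaces serve as the
        # reference that all further environments are checked against below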
        environment = Environment.create(
            environment=environments[0], max_episode_timesteps=max_episode_timesteps,
            remote=remote, blocking=blocking, host=host[0], port=port[0]
        )
        self.is_environment_remote = isinstance(environment, RemoteEnvironment)
        states = environment.states()
        actions = environment.actions()
        self.environments.append(environment)

        for n, environment in enumerate(environments[1:], start=1):
            assert isinstance(environment, Environment) == self.is_environment_external
            environment = Environment.create(
                environment=environment, max_episode_timesteps=max_episode_timesteps,
                remote=remote, blocking=blocking, host=host[n], port=port[n]
            )
            assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote
            assert util.is_equal(x=environment.states(), y=states)
            assert util.is_equal(x=environment.actions(), y=actions)
            self.environments.append(environment)

        self.evaluation = evaluation

        self.is_agent_external = isinstance(agent, Agent)
        if num_parallel - int(self.evaluation) > 1:
            self.agent = Agent.create(
                agent=agent, environment=environment,
                parallel_interactions=(num_parallel - int(self.evaluation))
            )
        else:
            self.agent = Agent.create(agent=agent, environment=environment)
Example #30
    def test_execution(self):
        self.start_tests(name='getting-started-execution')

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=10)
        runner.run(num_episodes=10)
        runner.run(num_episodes=5, evaluation=True)
        runner.close()
        self.finished_test()

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=50,
                        num_parallel=5,
                        remote='multiprocessing')
        runner.run(num_episodes=10)
        runner.close()
        self.finished_test()

        # Create agent and environment
        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=10)
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)

        # Train for 10 episodes using act and observe
        for _ in range(10):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # Train for 10 more episodes using recorded experience and explicit updates
        for _ in range(10):
            episode_states = list()
            episode_internals = list()
            episode_actions = list()
            episode_terminal = list()
            episode_reward = list()

            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                episode_states.append(states)
                episode_internals.append(internals)
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True)
                episode_actions.append(actions)
                states, terminal, reward = environment.execute(actions=actions)
                episode_terminal.append(terminal)
                episode_reward.append(reward)

            agent.experience(states=episode_states,
                             internals=episode_internals,
                             actions=episode_actions,
                             terminal=episode_terminal,
                             reward=episode_reward)
            agent.update()

        # Evaluate for 10 episodes
        sum_rewards = 0.0
        for _ in range(10):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               deterministic=True,
                                               independent=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward

        print('Mean episode reward:', sum_rewards / 10)

        # Close agent and environment
        agent.close()
        environment.close()

        self.finished_test()