Example No. 1
    def prepare(self,
                environment=None,
                min_timesteps=None,
                states=None,
                actions=None,
                exclude_bool_action=False,
                exclude_int_action=False,
                exclude_float_action=False,
                exclude_bounded_action=False,
                require_observe=False,
                require_all=False,
                **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            if states is None:
                states = deepcopy(self.__class__.states)

            if actions is None:
                actions = deepcopy(self.__class__.actions)
                if exclude_bool_action or self.__class__.exclude_bool_action:
                    actions.pop('bool_action')
                if exclude_int_action or self.__class__.exclude_int_action:
                    actions.pop('int_action')
                if exclude_float_action or self.__class__.exclude_float_action:
                    actions.pop('float_action')
                if exclude_bounded_action or self.__class__.exclude_bounded_action:
                    actions.pop('bounded_action')

            if min_timesteps is None:
                min_timesteps = self.__class__.min_timesteps

            environment = UnittestEnvironment(states=states,
                                              actions=actions,
                                              min_timesteps=min_timesteps)

        elif min_timesteps is not None:
            raise TensorforceError.unexpected()

        environment = Environment.create(environment=environment,
                                         max_episode_timesteps=5)

        for key, value in self.__class__.agent.items():
            if key not in agent:
                agent[key] = value

        if self.__class__.require_all or require_all:
            config = None
        elif self.__class__.require_observe or require_observe:
            config = dict(api_functions=['reset', 'act', 'observe'])
        else:
            config = dict(api_functions=['reset', 'act'])

        agent = Agent.create(agent=agent,
                             environment=environment,
                             config=config)

        return agent, environment
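A minimal sketch of how a test case might call this helper, assuming the standard Tensorforce act/execute/observe interface (the test name and the episode loop below are illustrative, not part of the original suite):

    def test_example(self):
        # Hedged sketch: obtain agent and environment from the helper above,
        # then run a single act/observe episode against the unittest environment.
        agent, environment = self.prepare(require_observe=True)
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
        agent.close()
        environment.close()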
Example No. 2
    def default_agent(self, **kwargs) -> Agent:
        return Agent.create(
            agent='ppo',
            environment=self,
            max_episode_timesteps=kwargs.get('max_episode_timesteps'),
            batch_size=1)
def main():
    # Set random seeds
    tf.random.set_seed(0)
    np.random.seed(0)

    num_models, output_size, val_model_outputs, y_val, test_model_outputs, y_test, avg_model_costs, weight_table = data_loader(
    )
    environment = ModelSelectionEnvironment(num_models, output_size,
                                            val_model_outputs, y_val,
                                            test_model_outputs, y_test,
                                            avg_model_costs)
    #environment = ModelSelectionEnvironment(num_models, output_size, val_model_outputs, y_val, test_model_outputs, y_test, avg_model_costs, weight_table)

    # agent = Agent.create(
    #     agent='ppo', environment=environment,
    #     # Automatically configured network
    #     network='auto',
    #     # Optimization
    #     batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
    #     optimization_steps=5,
    #     # Reward estimation
    #     likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
    #     # Critic
    #     critic_network='auto',
    #     critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
    #     # Preprocessing
    #     preprocessing=None,
    #     # Exploration
    #     exploration=0.0, variable_noise=0.0,
    #     # Regularization
    #     l2_regularization=0.0, entropy_regularization=0.0,
    #     # TensorFlow etc
    #     name='agent', device=None, parallel_interactions=1, seed=None, execution=None, saver=None,
    #     summarizer=None, recorder=None
    # )

    # agent = Agent.create(
    #     agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3
    # )

    agent = Agent.create(
        agent='tensorforce',
        environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
        memory=10000,
        update=dict(unit='timesteps', batch_size=64),
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=num_models + 1),
        config=dict(seed=0))

    # Load agent from saved directory
    # agent = tf.saved_model.load(export_dir="models/tensorforce")

    time_weights = [x / 10000 for x in avg_model_costs]
    runner(environment,
           agent,
           n_episodes=5000,
           n_episodes_test=y_test.shape[0],
           time_weights=time_weights)
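The runner function called above is project-specific and not shown in this listing. A hypothetical minimal version consistent with the call signature might look like the sketch below; the time_weights-based reward shaping of the original is omitted and the evaluation pass is only indicative:

def runner(environment, agent, n_episodes, n_episodes_test, time_weights):
    # Hedged sketch, not the original implementation: plain act/execute/observe
    # training followed by deterministic evaluation episodes.
    for _ in range(n_episodes):
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
    for _ in range(n_episodes_test):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(
                states=states, internals=internals,
                independent=True, deterministic=True)
            states, terminal, _ = environment.execute(actions=actions)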
Example No. 4
    def test_record_and_pretrain(self):
        self.start_tests(name='record-and-pretrain')

        with TemporaryDirectory() as directory:

            # ====================

            # Start recording traces after 8 episodes, i.e. only for the last 2 of the 10 training episodes
            runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                                       recorder=dict(directory=directory,
                                                     start=8)),
                            environment='benchmarks/configs/cartpole.json')
            runner.run(num_episodes=10)
            runner.close()

            # ====================

            # Trivial custom act function
            def fn_act(states):
                return int(states[2] < 0.0)

            # Record 2 episodes with the custom act function
            runner = Runner(agent=dict(agent=fn_act,
                                       recorder=dict(directory=directory)),
                            environment='benchmarks/configs/cartpole.json')
            # or: agent = Agent.create(agent=fn_act, recorder=dict(directory=directory))
            runner.run(num_episodes=2)
            runner.close()

            # ====================

            # Train an agent for 8 episodes, then use it below to record traces manually
            environment = Environment.create(
                environment='benchmarks/configs/cartpole.json')
            agent = Agent.create(agent='benchmarks/configs/ppo.json',
                                 environment=environment)
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=8)
            runner.close()

            # Manually record 2 more episodes (traces 2 and 3)
            for episode in range(2, 4):

                # Record episode experience
                episode_states = list()
                episode_actions = list()
                episode_terminal = list()
                episode_reward = list()

                # Evaluation episode
                states = environment.reset()
                terminal = False
                while not terminal:
                    episode_states.append(states)
                    actions = agent.act(states=states,
                                        independent=True,
                                        deterministic=True)
                    episode_actions.append(actions)
                    states, terminal, reward = environment.execute(
                        actions=actions)
                    episode_terminal.append(terminal)
                    episode_reward.append(reward)

                # Write recorded episode trace to npz file
                np.savez_compressed(file=os.path.join(
                    directory, 'trace-{:09d}.npz'.format(episode)),
                                    states=np.stack(episode_states, axis=0),
                                    actions=np.stack(episode_actions, axis=0),
                                    terminal=np.stack(episode_terminal,
                                                      axis=0),
                                    reward=np.stack(episode_reward, axis=0))

            # ====================

            # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
            # experience of one episode to the agent and subsequently perform one update
            environment = Environment.create(
                environment='benchmarks/configs/cartpole.json')
            agent = Agent.create(agent='benchmarks/configs/ppo.json',
                                 environment=environment)
            agent.pretrain(directory=directory,
                           num_iterations=30,
                           num_traces=1,
                           num_updates=1)

            # Evaluate the pretrained agent
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=10, evaluation=True)
            runner.close()

            # Close agent and environment
            agent.close()
            environment.close()

            # ====================

            # Performance test
            environment = Environment.create(
                environment='benchmarks/configs/cartpole.json')
            agent = Agent.create(agent='benchmarks/configs/ppo.json',
                                 environment=environment)
            agent.pretrain(directory='test/data/ppo-traces',
                           num_iterations=30,
                           num_traces=1,
                           num_updates=1)
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=10, evaluation=True)
            self.assertTrue(
                all(episode_reward == 500.0
                    for episode_reward in runner.episode_rewards))
            runner.close()
            agent.close()
            environment.close()

            files = sorted(os.listdir(path=directory))
            self.assertEqual(len(files), 6)
            self.assertTrue(
                all(
                    file.startswith('trace-')
                    and file.endswith('0000000{}.npz'.format(n))
                    for n, file in zip([0, 1, 2, 3, 8, 9], files)))

        self.finished_test()
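The traces written above are ordinary compressed NumPy archives, so they can be inspected directly. A short sketch (the filename is one of those produced by the format string used in the test; adjust the path to the recording directory):

import numpy as np

# Hedged sketch: open one recorded trace and check the arrays saved by
# np.savez_compressed above (states, actions, terminal, reward).
trace = np.load('trace-000000002.npz')
print(trace['states'].shape, trace['actions'].shape)
print('episode return:', float(trace['reward'].sum()))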
Example No. 5
# Pre-defined or custom environment
# environment = Environment.create(
#     environment='gym', level='CartPole', max_episode_timesteps=500
# )


# environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500)
environment = OpenAIGym('LunarLanderContinuous-v2', visualize=False, max_episode_steps=500)
# environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500)


agent = Agent.create(
    agent='ppo', environment=environment, batch_size=10,
    network=[
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    learning_rate=1e-3

)


running_score = 0.0
# Train for up to 50000 episodes
for i_epoch in range(50000):
    game_score = 0.0
    # Initialize episode
    states = environment.reset()
    terminal = False

    while not terminal:
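        # NOTE: the listing is cut off here. The lines below are a hedged
        # sketch of the standard loop body used in the other examples, not
        # the original author's code.
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        game_score += reward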
Example No. 6
    def __init__(self,
                 agent,
                 environment=None,
                 max_episode_timesteps=None,
                 num_parallel=None,
                 environments=None,
                 evaluation=False,
                 remote=None,
                 blocking=False,
                 host=None,
                 port=None):
        if environment is None and environments is None:
            if remote != 'socket-client':
                raise TensorforceError.required(
                    name='Runner', argument='environment or environments')
            if num_parallel is None:
                raise TensorforceError.required(
                    name='Runner',
                    argument='num_parallel',
                    condition='socket-client remote mode')
            environments = [None for _ in range(num_parallel)]

        elif environment is None:
            if environments is None:
                raise TensorforceError.required(
                    name='Runner', argument='environment or environments')
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(name='Runner',
                                            argument='environments',
                                            value=environments)
            if len(environments) == 0:
                raise TensorforceError.value(name='Runner',
                                             argument='len(environments)',
                                             value=len(environments))
            if num_parallel is not None and num_parallel != len(environments):
                raise TensorforceError.value(name='Runner',
                                             argument='num_parallel',
                                             value=num_parallel,
                                             hint='!= len(environments)')
            num_parallel = len(environments)
            environments = list(environments)

        elif num_parallel is None:
            if environments is not None:
                raise TensorforceError.invalid(
                    name='Runner',
                    argument='environments',
                    condition='environment is specified')
            if evaluation:
                raise TensorforceError.invalid(name='Runner',
                                               argument='evaluation',
                                               condition='single environment')
            num_parallel = 1
            environments = [environment]

        else:
            if not isinstance(num_parallel, int):
                raise TensorforceError.value(name='Runner',
                                             argument='num_parallel',
                                             dtype=type(num_parallel))
            elif num_parallel < 2:
                raise TensorforceError.value(name='Runner',
                                             argument='num_parallel',
                                             value=num_parallel,
                                             hint='< 2')
            if environments is not None:
                raise TensorforceError.invalid(
                    name='Runner',
                    argument='environments',
                    condition='environment is specified')
            if isinstance(environment, Environment):
                raise TensorforceError.value(
                    name='Runner',
                    argument='environment',
                    value=environment,
                    condition='num_parallel',
                    hint=
                    'is Environment instance, but specification dict is required'
                )
            environments = [environment for _ in range(num_parallel)]

        if port is None or isinstance(port, int):
            if isinstance(host, str):
                port = [port + n for n in range(num_parallel)]
            else:
                port = [port for _ in range(num_parallel)]
        else:
            if len(port) != num_parallel:
                raise TensorforceError.value(name='Runner',
                                             argument='len(port)',
                                             value=len(port),
                                             hint='!= num_parallel')
        if host is None or isinstance(host, str):
            host = [host for _ in range(num_parallel)]
        else:
            if len(host) != num_parallel:
                raise TensorforceError.value(name='Runner',
                                             argument='len(host)',
                                             value=len(host),
                                             hint='!= num_parallel')

        self.environments = list()
        self.is_environment_external = isinstance(environments[0], Environment)
        environment = Environment.create(
            environment=environments[0],
            max_episode_timesteps=max_episode_timesteps,
            remote=remote,
            blocking=blocking,
            host=host[0],
            port=port[0])
        self.is_environment_remote = isinstance(environment, RemoteEnvironment)
        states = environment.states()
        actions = environment.actions()
        self.environments.append(environment)
        if remote is None and len(
                environments) > 1 and environment.is_vectorizable():
            self.num_vectorized = num_parallel
            environments = environments[:1]
            if evaluation:
                raise TensorforceError.invalid(
                    name='Runner',
                    argument='evaluation',
                    condition='vectorized environment')
        else:
            self.num_vectorized = None

        for n, environment in enumerate(environments[1:], start=1):
            assert isinstance(environment,
                              Environment) == self.is_environment_external
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                remote=remote,
                blocking=blocking,
                host=host[n],
                port=port[n])
            assert isinstance(environment,
                              RemoteEnvironment) == self.is_environment_remote
            assert util.is_equal(x=environment.states(), y=states)
            assert util.is_equal(x=environment.actions(), y=actions)
            self.environments.append(environment)

        self.evaluation = evaluation

        self.is_agent_external = isinstance(agent, Agent)
        if num_parallel - int(self.evaluation) > 1:
            self.agent = Agent.create(
                agent=agent,
                environment=environment,
                parallel_interactions=(num_parallel - int(self.evaluation)))
        else:
            self.agent = Agent.create(agent=agent, environment=environment)
Example No. 7
    def test_act_observe_vectorized(self):
        self.start_tests(name='act-observe-vectorized')

        # ====================

        num_parallel = 8
        environment = Environment.create(environment='custom_cartpole',
                                         max_episode_timesteps=500)
        agent = Agent.create(agent='benchmarks/configs/ppo.json',
                             environment=environment,
                             parallel_interactions=num_parallel)

        # Train for 10 episodes, num_parallel episodes at a time
        for episode in range(0, 10, num_parallel):

            # Episode using act and observe
            parallel, states = environment.reset(num_parallel=num_parallel)
            terminal = (parallel < 0)  # all false
            sum_rewards = 0.0
            num_updates = 0
            while not terminal.all():
                actions = agent.act(states=states, parallel=parallel)
                next_parallel, states, terminal, reward = environment.execute(
                    actions=actions)
                num_updates += agent.observe(terminal=terminal,
                                             reward=reward,
                                             parallel=parallel)
                parallel = next_parallel
                sum_rewards += reward.sum()
            print('Episode {}: return={} updates={}'.format(
                episode, sum_rewards / num_parallel, num_updates))

        # Evaluate for 10 episodes, num_parallel episodes at a time
        num_parallel = 4
        num_episodes = 10
        sum_rewards = 0.0
        for _ in range(0, num_episodes, num_parallel):
            parallel, states = environment.reset(num_parallel=num_parallel)
            internals = agent.initial_internals()
            internals = [internals for _ in range(num_parallel)]
            terminal = (parallel < 0)  # all false
            while not terminal.all():
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True,
                                               deterministic=True)
                _, states, terminal, reward = environment.execute(
                    actions=actions)
                internals = [
                    internal for internal, term in zip(internals, terminal)
                    if not term
                ]
                sum_rewards += reward.sum()
        print('Mean evaluation return:', sum_rewards / num_episodes)

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        self.finished_test()
Example No. 8
    def create_agent(
        self,
        env,
        n_episodes,
        save_frenquency,
        load=False,
    ):
        ########### WORK NEEDED ###########
        ### You need to tweak the Agent ###
        ###################################
        """
        Agent definition. Tweak the Agent's parameters to your convenience

        Use any agent from tensorforce and refer to the documentation for the available hyperparameters :
        -Vanilla Policy Gradient : https://tensorforce.readthedocs.io/en/latest/agents/vpg.html
        -Proximal Policy Optimization : https://tensorforce.readthedocs.io/en/latest/agents/ppo.html
        -Trust-Region Policy Optimization : https://tensorforce.readthedocs.io/en/latest/agents/trpo.html
        -Deterministic Policy Gradient : https://tensorforce.readthedocs.io/en/latest/agents/dpg.html
        -Deep Q-Network : https://tensorforce.readthedocs.io/en/latest/agents/dqn.html
        -Double DQN : https://tensorforce.readthedocs.io/en/latest/agents/double_dqn.html
        -Dueling DQN : https://tensorforce.readthedocs.io/en/latest/agents/dueling_dqn.html
        -Actor-Critic : https://tensorforce.readthedocs.io/en/latest/agents/ac.html
        -Advantage Actor-Critic : https://tensorforce.readthedocs.io/en/latest/agents/a2c.html

        For the network parameters :
        https://tensorforce.readthedocs.io/en/latest/modules/networks.html


        """
        ##### Agent definition ########
        if not (load):
            print(f"INIT AGENT.")
            agent = Agent.create(
                agent="ppo",
                states={
                    'type': 'float',
                    'shape': (10, ),
                    'min_value': [
                        -1.000e+00, -2.000e+00, -1.000e+00, -1.000e+00,
                        -1.000e+00, -1.280e+00, -3.400e+00, -9.999e+03,
                        -9.999e+03, -9.999e+03
                    ],
                    'max_value': [
                        1.000e+00, 2.000e+00, 1.000e+00, 1.000e+00, 1.000e+00,
                        1.000e+00, 3.600e+00, 9.999e+03, 9.999e+03, 9.999e+03
                    ]
                },
                actions=dict(
                    gimbal=dict(type='int', shape=1, num_values=5),
                    throttle=dict(type='int', shape=1, num_values=5),
                    side_booster=dict(type='int', shape=1, num_values=5),
                ),
                max_episode_timesteps=100000,
                batch_size=8,
                discount=0.99,
                exploration=0.01,
                #entropy_regularization=1e-3,
                #l2_regularization=1e-3,
                learning_rate=5e-4,
                config=dict(name="ppo_agent_V3"),
                saver=dict(
                    directory="data/checkpoints",
                    frequency=10  # save checkpoint every 10 updates
                ),  # don't change this
                # environment=env,
            )

        else:
            print(f"RELOADING AGENT.")
            agent = Agent.load(directory="data/checkpoints",
                               filename="ppo_agent_V3")
        return agent
Example No. 9
    def __init__(self,
                 agent,
                 environment=None,
                 max_episode_timesteps=None,
                 evaluation=False,
                 num_parallel=None,
                 environments=None,
                 remote=None,
                 blocking=False,
                 host=None,
                 port=None):
        if environment is None and environments is None:
            assert num_parallel is not None and remote == 'socket-client'
            environments = [None for _ in range(num_parallel)]

        elif environment is None:
            assert environments is not None
            assert num_parallel is None or num_parallel == len(environments)
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(name='parallel-runner',
                                            argument='environments',
                                            value=environments)
            elif len(environments) == 0:
                raise TensorforceError.value(name='parallel-runner',
                                             argument='environments',
                                             value=environments)
            num_parallel = len(environments)
            environments = list(environments)

        elif num_parallel is None:
            assert environments is None
            num_parallel = 1
            environments = [environment]

        else:
            assert environments is None
            assert not isinstance(environment, Environment)
            environments = [environment for _ in range(num_parallel)]

        if port is None or isinstance(port, int):
            if isinstance(host, str):
                port = [port + n for n in range(num_parallel)]
            else:
                port = [port for _ in range(num_parallel)]
        else:
            assert len(port) == num_parallel
        if host is None or isinstance(host, str):
            host = [host for _ in range(num_parallel)]
        else:
            assert len(host) == num_parallel

        self.environments = list()
        self.is_environment_external = isinstance(environments[0], Environment)
        environment = Environment.create(
            environment=environments[0],
            max_episode_timesteps=max_episode_timesteps,
            remote=remote,
            blocking=blocking,
            host=host[0],
            port=port[0])
        self.is_environment_remote = isinstance(environment, RemoteEnvironment)
        states = environment.states()
        actions = environment.actions()
        self.environments.append(environment)

        for n, environment in enumerate(environments[1:], start=1):
            assert isinstance(environment,
                              Environment) == self.is_environment_external
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                remote=remote,
                blocking=blocking,
                host=host[n],
                port=port[n])
            assert isinstance(environment,
                              RemoteEnvironment) == self.is_environment_remote
            assert environment.states() == states
            assert environment.actions() == actions
            self.environments.append(environment)

        self.evaluation = evaluation

        self.is_agent_external = isinstance(agent, Agent)
        if num_parallel - int(self.evaluation) > 1:
            self.agent = Agent.create(
                agent=agent,
                environment=environment,
                parallel_interactions=(num_parallel - int(self.evaluation)))
        else:
            self.agent = Agent.create(agent=agent, environment=environment)
Example No. 10
    print(result_vec)
    # Plot the evolution of the agent over the batches
    # plot_multiple(
    #     Series=[result_vec],
    #     labels = ["Reward"],
    #     xlabel = "episodes",
    #     ylabel = "Reward",
    #     title = "Reward vs episodes",
    #     save_fig=True,
    #     path="env",
    #     folder=str(combination),
    #     time=False,
    # )
    #Terminate the agent and the environment
    agent.close()
    environment.close()


# %%
from tensorforce import Agent

environment = CustomEnvironment(25, 25)

# Instantiate a Tensorforce agent
agent = Agent.create(agent="ppo", environment=environment, batch_size=10)

# Call runner
runner(environment, agent, max_step_per_episode=1000, n_episodes=10000)

# %%
Example No. 11
    def __init__(self,
                 agent,
                 environment=None,
                 num_parallel=None,
                 environments=None,
                 max_episode_timesteps=None,
                 evaluation_environment=None,
                 save_best_agent=None):
        self.environments = list()
        if environment is None:
            assert num_parallel is None and environments is not None
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(name='parallel-runner',
                                            argument='environments',
                                            value=environments)
            elif len(environments) == 0:
                raise TensorforceError.value(name='parallel-runner',
                                             argument='environments',
                                             value=environments)
            num_parallel = len(environments)
            environment = environments[0]
            self.is_environment_external = isinstance(environment, Environment)
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)
            states = environment.states()
            actions = environment.actions()
            self.environments.append(environment)
            for environment in environments[1:]:
                assert isinstance(environment,
                                  Environment) == self.is_environment_external
                environment = Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps)
                assert environment.states() == states
                assert environment.actions() == actions
                self.environments.append(environment)

        else:
            assert num_parallel is not None and environments is None
            assert not isinstance(environment, Environment)
            self.is_environment_external = False
            for _ in range(num_parallel):
                environment = Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps)
                self.environments.append(environment)

        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.is_eval_environment_external = isinstance(
                evaluation_environment, Environment)
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment,
                max_episode_timesteps=max_episode_timesteps)
            assert self.evaluation_environment.states() == environment.states()
            assert self.evaluation_environment.actions(
            ) == environment.actions()

        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict(parallel_interactions=num_parallel)
        self.agent = Agent.create(agent=agent,
                                  environment=environment,
                                  **kwargs)
        self.save_best_agent = save_best_agent

        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        self.evaluation_rewards = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
    subsampling_fraction=0.91,
    likelihood_ratio_clipping=0.09,
    discount=1.0,
    predict_terminal_values='false',
    baseline=dict(type='custom', layers=layers),
    baseline_optimizer=dict(optimizer="adam", learning_rate=1e-3,
                            multi_step=5),
    l2_regularization=0.0,
    entropy_regularization=0.3,
    state_preprocessing='linear_normalization',
    exploration=exploration,
    variable_noise=0.0,
    recorder=None,
    parallel_interactions=1)

agent = Agent.create(agent=customized_agent, environment=environment)
reward_record_without = []
print('training agent without boundary')
for _ in tqdm(range(episode_number)):
    episode_reward = 0
    states = environment.reset()
    terminal = False
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        episode_reward += reward
        agent.observe(terminal=terminal, reward=reward)
    reward_record_without.append(episode_reward)
    print(episode_reward)
temp = np.array(reward_record_without)
reward_record_without_average = moving_average(temp, average_over)
length = np.zeros(episode_number)
measure_length = moving_average(length, average_over)

prohibition_parameter = [0, -5, -10, -15, -20, -25, -30]
prohibition_position = [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99]

reward_record = np.zeros((len(prohibition_position),
                          len(prohibition_parameter), len(measure_length)))
x_threshold = 2.4

for k in range(len(prohibition_position)):
    for i in range(len(prohibition_parameter)):
        record = []
        agent = Agent.create(agent='a2c',
                             environment=environment,
                             batch_size=64,
                             learning_rate=1e-2)
        print(
            'running experiment with boundary position at %s and prohibitive parameter %s'
            % (prohibition_position[k], prohibition_parameter[i]))
        for _ in tqdm(range(episode_number)):
            episode_reward = 0
            states = environment.reset()
            terminal = False
            while not terminal:
                x_position = states[0]
                if x_position >= prohibition_position[k] * x_threshold:
                    episode_reward += prohibition_parameter[i]
                    actions = agent.act(states=states)
                    actions = 0
                elif x_position <= -prohibition_position[k] * x_threshold:
        'min_value': input_low,
        'max_value': input_high
    }

    env = Environment(action_mode, '', ObservationConfig(), False)
    task = env.get_task(
        PutGroceriesInCupboard
    )  # available tasks: EmptyContainer, PlayJenga, PutGroceriesInCupboard, SetTheTable

    len_episode = 30
    explore = 0.6

    agent = Agent.create(
        agent='dqn',
        states=states_dict,  # alternatively: states, actions, (max_episode_timesteps)
        actions=actions_dict,
        memory=10000,
        max_episode_timesteps=len_episode,
        exploration=explore)

    obj_pose_sensor = NoisyObjectPoseSensor(env)

    descriptions, obs = task.reset()
    print(descriptions)

    while True:
        obj_poses = obj_pose_sensor.get_poses()
        target_state = list(obj_poses['sugar'])
        target_state[2] += 0.1

        # best_reward = 1/np.linalg.norm(target_state[:3] - obs.gripper_pose[:3])
Example No. 15
from Normal import prohibition_position
from Normal import environment

# Training and evaluation with boundary
reward_record_average = np.zeros((len(prohibition_position), len(prohibition_parameter), len(measure_length)))
reward_record = np.zeros((len(prohibition_position), len(prohibition_parameter), episode_number))
evaluation_reward_record = np.zeros((len(prohibition_position), len(prohibition_parameter), evaluation_episode_number))

coach = Agent.load(directory='Walker_RL', format='numpy')
internals = coach.initial_internals()

for k in range(len(prohibition_position)):
    # Training
    for i in range(len(prohibition_parameter)):
        record = []
        agent = Agent.create(agent='agent.json', environment=environment)
        print('training agent with boundary position at %s and prohibitive parameter %s' % (prohibition_position[k], prohibition_parameter[i]))
        for _ in tqdm(range(episode_number)):
            episode_reward = 0
            states = environment.reset()
            terminal = False
            while not terminal:
                rootz = states[0]
                actions = agent.act(states=states)
                if abs(1.25 - rootz) >= prohibition_position[k]:
                    actions, internals = coach.act(states=states, internals=internals, independent=True, deterministic=True)
                    states, terminal, reward = environment.execute(actions=actions)
                    reward = 0
                    episode_reward += reward
                    agent.observe(terminal=terminal, reward=reward)
Example No. 16
model_dir = './data/m1/'

num_ep = 100
max_ep_ts = 200

# Pre-defined or custom environment
environment = Environment.create(
    environment='gym', level=env, max_episode_timesteps=max_ep_ts
)

# Instantiate a Tensorforce agent
agent = Agent.create(
    agent='ppo',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    batch_size=4,
    saver=dict(
        directory=(model_dir + 'checkpoints'),
        frequency=2  # save checkpoint every 2 updates
    ),
)


output = []
# Train for 100 episodes with up to 200 timesteps each
for i in range(num_ep):

    # Initialize episode
    states = environment.reset()
    terminal = False

    rs = []
Example No. 17
    def unittest(self,
                 environment=None,
                 states=None,
                 actions=None,
                 num_episodes=None,
                 experience_update=None,
                 **agent):
        """
        Generic unit-test.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            max_episode_timesteps = environment.pop(
                'max_episode_timesteps')  # runner argument

        else:
            max_episode_timesteps = self.__class__.max_episode_timesteps

        agent = self.agent_spec(**agent)

        if num_episodes is None:
            num_updates = 2
        else:
            num_updates = None

        runner = Runner(agent=agent,
                        environment=environment,
                        max_episode_timesteps=max_episode_timesteps)
        runner.run(num_episodes=num_episodes,
                   num_updates=num_updates,
                   use_tqdm=False)
        runner.close()

        # Test experience-update, independent, deterministic
        if experience_update or (experience_update is None
                                 and self.__class__.experience_update):
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)
            agent = Agent.create(agent=agent, environment=environment)
            for episode in range(
                    num_updates if num_episodes is None else num_episodes):
                episode_states = list()
                episode_internals = list()
                episode_actions = list()
                episode_terminal = list()
                episode_reward = list()
                states = environment.reset()
                internals = agent.initial_internals()
                terminal = False
                deterministic = True
                while not terminal:
                    episode_states.append(states)
                    episode_internals.append(internals)
                    actions, internals = agent.act(states=states,
                                                   internals=internals,
                                                   independent=True,
                                                   deterministic=deterministic)
                    deterministic = not deterministic
                    episode_actions.append(actions)
                    states, terminal, reward = environment.execute(
                        actions=actions)
                    episode_terminal.append(terminal)
                    episode_reward.append(reward)
                agent.experience(states=episode_states,
                                 internals=episode_internals,
                                 actions=episode_actions,
                                 terminal=episode_terminal,
                                 reward=episode_reward)
                agent.update()

        self.finished_test()
Example No. 18
environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500)

# Instantiate a Tensorforce agent
# agent = Agent.create(
#     agent='tensorforce',
#     environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
#     memory=10000,
#     update=dict(unit='timesteps', batch_size=64),
#     optimizer=dict(type='adam', learning_rate=3e-4),
#     policy=dict(network='auto'),
#     objective='policy_gradient',
#     reward_estimation=dict(horizon=20)
# )

agent = Agent.create(agent='ppo',
                     environment=environment,
                     batch_size=10,
                     learning_rate=1e-3)

# agent = PPOAgent(
#     states_spec=environment.states,
#     actions_spec=environment.actions,
#     network_spec=network_spec,
#     batch_size=4096,
#     # BatchAgent
#     keep_last_timestep=True,
#     # PPOAgent
#     step_optimizer=dict(
#         type='adam',
#         learning_rate=1e-3
#     ),
#     optimization_steps=10,
    Num   Action                                        Min            Max
    0     Thigh Joint Motor                             -1             1
    1     Leg Joint Motor                               -1             1
    2     Foot Joint Motor                              -1             1
    3     Thigh Left Joint Motor                        -1             1
    4     Leg Left Joint Motor                          -1             1
    5     Foot Left Joint Motor                         -1             1
Termination:
        done = not (height > 0.8 and height < 2.0 and
                    ang > -1.0 and ang < 1.0)
'''

if __name__ == "__main__":
    reward_record_without = []

    agent_without = Agent.create(agent='agent.json',
                                 environment=environment,
                                 exploration=exploration)
    states = environment.reset()
    reward_record_without = []
    terminal = False
    print('Training Normal Agent')
    for _ in tqdm(range(episode_number)):
        episode_reward = 0
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent_without.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            episode_reward += reward
            agent_without.observe(terminal=terminal, reward=reward)
        reward_record_without.append(episode_reward)
Example No. 20
@author: avales
"""

from tensorforce import Agent, Environment

# Pre-defined or custom environment
environment = Environment.create(environment='gym',
                                 level='CartPole',
                                 max_episode_timesteps=500)

# Instantiate a Tensorforce agent
agent = Agent.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20))

# Train for 300 episodes
for _ in range(300):

    # Initialize episode
    states = environment.reset()
    terminal = False

    while not terminal:
        # Episode timestep
        actions = agent.act(states=states)
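        # NOTE: the listing is cut off here. A hedged sketch of the remaining
        # loop body (execute the chosen action and feed the result back):
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)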
Example No. 21
    def test_act_experience_update(self):
        self.start_tests(name='act-experience-update')

        # ====================

        environment = Environment.create(
            environment='benchmarks/configs/cartpole.json')
        agent = Agent.create(agent='benchmarks/configs/ppo.json',
                             environment=environment)

        # Train for 10 episodes
        for episode in range(10):

            # Record episode experience
            episode_states = list()
            episode_internals = list()
            episode_actions = list()
            episode_terminal = list()
            episode_reward = list()

            # Episode using independent-act and agent.initial_internals()
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            sum_reward = 0.0
            while not terminal:
                episode_states.append(states)
                episode_internals.append(internals)
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True)
                episode_actions.append(actions)
                states, terminal, reward = environment.execute(actions=actions)
                episode_terminal.append(terminal)
                episode_reward.append(reward)
                sum_reward += reward
            print('Episode {}: {}'.format(episode, sum_reward))

            # Feed recorded experience to agent
            agent.experience(states=episode_states,
                             internals=episode_internals,
                             actions=episode_actions,
                             terminal=episode_terminal,
                             reward=episode_reward)

            # Perform update
            agent.update()

        # Evaluate for 10 episodes
        sum_rewards = 0.0
        for _ in range(10):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True,
                                               deterministic=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward
        print('Mean evaluation return:', sum_rewards / 10.0)

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        self.finished_test()
Example No. 22
    def test_temperature_controller(self):
        self.start_tests(name='temperature-controller')

        # ====================

        import pandas as pd
        import matplotlib.pyplot as plt
        import numpy as np
        import math

        ## Compute the response for a given action and current temperature
        def respond(action, current_temp, tau):
            return action + (current_temp - action) * math.exp(-1.0 / tau)

        ## Actions of a series of on, then off
        sAction = pd.Series(
            np.array(
                [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
        sResponse = np.zeros(sAction.size)

        ## Update the response with the response to the action
        for i in range(sAction.size):
            ## Get last response
            if i == 0:
                last_response = 0
            else:
                last_response = sResponse[i - 1]
            sResponse[i] = respond(sAction[i], last_response, 3.0)

        ## Assemble and plot
        df = pd.DataFrame(list(zip(sAction, sResponse)),
                          columns=['action', 'response'])
        df.plot()

        # ====================

        def reward(temp):
            delta = abs(temp - 0.5)
            if delta < 0.1:
                return 0.0
            else:
                return -delta + 0.1

        temps = [x * 0.01 for x in range(100)]
        rewards = [reward(x) for x in temps]

        fig = plt.figure(figsize=(12, 4))

        plt.scatter(temps, rewards)
        plt.xlabel('Temperature')
        plt.ylabel('Reward')
        plt.title('Reward vs. Temperature')

        # ====================

        ###-----------------------------------------------------------------------------
        ## Imports
        from tensorforce.environments import Environment
        from tensorforce.agents import Agent

        ###-----------------------------------------------------------------------------
        ### Environment definition
        class ThermostatEnvironment(Environment):
            """This class defines a simple thermostat environment.  It is a room with
            a heater, and when the heater is on, the room temperature will approach
            the max heater temperature (usually 1.0), and when off, the room will
            decay to a temperature of 0.0.  The exponential constant that determines
            how fast it approaches these temperatures over timesteps is tau.
            """
            def __init__(self):
                ## Some initializations.  Will eventually parameterize this in the constructor.
                self.tau = 3.0
                self.current_temp = np.random.random(size=(1, ))

                super().__init__()

            def states(self):
                return dict(type='float', shape=(1, ))

            def actions(self):
                """Action 0 means no heater, temperature approaches 0.0.  Action 1 means
                the heater is on and the room temperature approaches 1.0.
                """
                return dict(type='int', num_values=2)

            # Optional, should only be defined if environment has a natural maximum
            # episode length
            def max_episode_timesteps(self):
                return super().max_episode_timesteps()

            # Optional
            def close(self):
                super().close()

            def reset(self):
                """Reset state.
                """
                # state = np.random.random(size=(1,))
                self.timestep = 0
                self.current_temp = np.random.random(size=(1, ))
                return self.current_temp

            def response(self, action):
                """Respond to an action.  When the action is 1, the temperature
                exponentially approaches 1.0.  When the action is 0, the
                current temperature exponentially decays towards 0.0.
                """
                return action + (self.current_temp - action) * math.exp(
                    -1.0 / self.tau)

            def reward_compute(self):
                """ The reward here is 0 if the current temp is between 0.4 and 0.6,
                else it is distance the temp is away from the 0.4 or 0.6 boundary.
                
                Return the value within the numpy array, not the numpy array.
                """
                delta = abs(self.current_temp - 0.5)
                if delta < 0.1:
                    return 0.0
                else:
                    return -delta[0] + 0.1

            def execute(self, actions):
                ## Check the action is either 0 or 1 -- heater on or off.
                assert actions == 0 or actions == 1

                ## Increment timestep
                self.timestep += 1

                ## Update the current_temp
                self.current_temp = self.response(actions)

                ## Compute the reward
                reward = self.reward_compute()

                ## The only way to go terminal is to exceed max_episode_timesteps.
                ## terminal == False means episode is not done
                ## terminal == True means it is done.
                terminal = False
                if self.timestep > self.max_episode_timesteps():
                    terminal = True

                return self.current_temp, terminal, reward

        ###-----------------------------------------------------------------------------
        ### Create the environment
        ###   - Tell it the environment class
        ###   - Set the max timestamps that can happen per episode
        environment = Environment.create(
            environment=ThermostatEnvironment, max_episode_timesteps=100)

        # ====================

        agent = Agent.create(agent='tensorforce',
                             environment=environment,
                             update=64,
                             objective='policy_gradient',
                             reward_estimation=dict(horizon=1))

        # ====================

        ### Initialize
        environment.reset()

        ## Creation of the environment via Environment.create() creates
        ## a wrapper class around the original Environment defined here.
        ## That wrapper mainly keeps track of the number of timesteps.
        ## In order to alter the attributes of your instance of the original
        ## class, like to set the initial temp to a custom value, like here,
        ## you need to access the `environment` member of this wrapped class.
        ## That is why you see the way to set the current_temp like below.
        environment.environment.current_temp = np.array([0.5])
        states = environment.environment.current_temp

        internals = agent.initial_internals()
        terminal = False

        ### Run an episode
        temp = [environment.environment.current_temp[0]]
        while not terminal:
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)
            temp += [states[0]]

        ### Plot the run
        plt.figure(figsize=(12, 4))
        ax = plt.subplot()
        ax.set_ylim([0.0, 1.0])
        plt.plot(range(len(temp)), temp)
        plt.hlines(y=0.4, xmin=0, xmax=99, color='r')
        plt.hlines(y=0.6, xmin=0, xmax=99, color='r')
        plt.xlabel('Timestep')
        plt.ylabel('Temperature')
        plt.title('Temperature vs. Timestep')
        plt.show()

        # Train for 50 episodes
        for _ in range(50):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # ====================

        ### Initialize
        environment.reset()

        ## Creation of the environment via Environment.create() creates
        ## a wrapper class around the original Environment defined here.
        ## That wrapper mainly keeps track of the number of timesteps.
        ## In order to alter the attributes of your instance of the original
        ## class, like to set the initial temp to a custom value, like here,
        ## you need to access the `environment` member of this wrapped class.
        ## That is why you see the way to set the current_temp like below.
        environment.environment.current_temp = np.array([1.0])
        states = environment.environment.current_temp

        internals = agent.initial_internals()
        terminal = False

        ### Run an episode
        temp = [environment.environment.current_temp[0]]
        while not terminal:
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)
            temp += [states[0]]

        ### Plot the run
        plt.figure(figsize=(12, 4))
        ax = plt.subplot()
        ax.set_ylim([0.0, 1.0])
        plt.plot(range(len(temp)), temp)
        plt.hlines(y=0.4, xmin=0, xmax=99, color='r')
        plt.hlines(y=0.6, xmin=0, xmax=99, color='r')
        plt.xlabel('Timestep')
        plt.ylabel('Temperature')
        plt.title('Temperature vs. Timestep')
        plt.show()

        # ====================

        self.finished_test()
Example No. 23
    def test_save_load_agent(self):
        self.start_tests(name='save-load-agent')

        with TemporaryDirectory() as checkpoint_directory, TemporaryDirectory(
        ) as numpy_directory:

            # ====================

            # OpenAI-Gym environment initialization
            environment = Environment.create(
                environment='benchmarks/configs/cartpole.json')

            # PPO agent initialization
            agent = Agent.create(
                agent='benchmarks/configs/ppo.json',
                environment=environment,
                # Option 1: Saver - save agent periodically every update
                # and keep the 5 most recent checkpoints
                saver=dict(directory=checkpoint_directory,
                           frequency=1,
                           max_checkpoints=5),
            )

            # Runner initialization
            runner = Runner(agent=agent, environment=environment)

            # Training
            runner.run(num_episodes=10)
            runner.close()

            # Option 2: Explicit save
            # (format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores full TensorFlow model,
            # agent argument saver, specified above, uses 'checkpoint')
            agent.save(directory=numpy_directory,
                       format='numpy',
                       append='episodes')

            # Close agent separately, since created separately
            agent.close()

            # Load agent TensorFlow checkpoint
            agent = Agent.load(directory=checkpoint_directory,
                               format='checkpoint',
                               environment=environment)
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=10, evaluation=True)
            runner.close()
            agent.close()

            # Load agent NumPy weights
            agent = Agent.load(directory=numpy_directory,
                               format='numpy',
                               environment=environment)
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=10, evaluation=True)
            runner.close()
            agent.close()

            # Close environment separately, since created separately
            environment.close()

        # ====================

        self.finished_test()
Example No. 24
    def test_execution(self):
        self.start_tests(name='getting-started-execution')

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=10)
        runner.run(num_episodes=10)
        runner.run(num_episodes=5, evaluation=True)
        runner.close()
        self.finished_test()

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=50,
                        num_parallel=5,
                        remote='multiprocessing')
        runner.run(num_episodes=10)
        runner.close()
        self.finished_test()

        # Create agent and environment
        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=10)
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)

        # Train for 10 episodes using act and observe
        for _ in range(10):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # Train for 10 more episodes using independent-act, experience, and update
        for _ in range(10):
            episode_states = list()
            episode_internals = list()
            episode_actions = list()
            episode_terminal = list()
            episode_reward = list()

            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                episode_states.append(states)
                episode_internals.append(internals)
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True)
                episode_actions.append(actions)
                states, terminal, reward = environment.execute(actions=actions)
                episode_terminal.append(terminal)
                episode_reward.append(reward)

            agent.experience(states=episode_states,
                             internals=episode_internals,
                             actions=episode_actions,
                             terminal=episode_terminal,
                             reward=episode_reward)
            agent.update()

        # Evaluate for 10 episodes
        sum_rewards = 0.0
        for _ in range(10):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               deterministic=True,
                                               independent=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward

        print('Mean episode reward:', sum_rewards / 10)

        # Close agent and environment
        agent.close()
        environment.close()

        self.finished_test()