Example #1
    def test_multi(self):
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[True, (True, 2)])
            config = Configuration(batch_size=16,
                                   learning_rate=0.00025,
                                   exploration=dict(type='ornstein_uhlenbeck'),
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 20 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-20:], r.episode_lengths[-20:]))

            runner.run(episodes=10000, episode_finished=episode_finished)
            print('NAF agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 10000:
                passed += 1

        print('NAF agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 0)
Example #2
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=50,
                                   memory='replay',
                                   first_update=20,
                                   repeat_update=4,
                                   target_update_frequency=10,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('Replay DQN: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('Replay DQN passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #3
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
            )
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('VPG Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('VPG discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #4
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   exploration=dict(type='ornstein_uhlenbeck'),
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('NAF agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('NAF agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #5
    def test_reinforceio_homepage(self):
        """
        Code example from the homepage and README.md.
        """

        from tensorforce import Configuration
        from tensorforce.agents import TRPOAgent

        config = Configuration(batch_size=100)

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(states_spec=dict(shape=(10, ), type='float'),
                          actions_spec=dict(type='int', num_actions=2),
                          network_spec=[
                              dict(type='dense', size=50),
                              dict(type='dense', size=50)
                          ],
                          config=config)

        # Get new data from somewhere, e.g. a client to a web app
        client = TestTutorialCode.MyClient('http://127.0.0.1', 8080)

        # Poll new state from client
        state = client.get_state()

        # Get prediction from agent, execute
        action = agent.act(states=state)
        reward = client.execute(action)

        # Add experience, agent automatically updates model according to batch size
        agent.observe(reward=reward, terminal=False)

        agent.close()
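
In practice the single act/observe step above runs in a loop before agent.close() is called. A minimal sketch, reusing the same hypothetical client object from the example:

# Illustrative loop (not part of the original example): repeat the
# poll/act/execute/observe cycle so the agent can accumulate a batch
# (batch_size=100 above) and update its model.
for _ in range(1000):
    state = client.get_state()
    action = agent.act(states=state)
    reward = client.execute(action)
    agent.observe(reward=reward, terminal=False)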
Example #6
    def test_baseline(self):
        config = Configuration(
            discount=0.75,
            batch_size=8,
            learning_rate=0.001,
        )
        # agent = VPGAgent(
        #     states_spec=dict(shape=(1,)),
        #     actions_spec=dict(type='int', num_actions=2),
        #     network_spec=[dict(type='dense', size=32)],
        #     config=config
        # )

        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        discounted_rewards = np.array([
            0.75 + 0.75**4, 1.0 + 0.75**3, 0.75**2, 0.75, 1.0, 1.0 + 0.75**2,
            0.75, 1.0, 0.0
        ])
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        #agent.model.baseline = dict(state=Baseline())
        #agent.model.baseline['state'].predict = lambda states: baseline

        #result, _ = agent.model.reward_estimation(states=dict(state=states), rewards=rewards, terminals=terminals)
        expected = discounted_rewards - baseline
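
As a cross-check, here is a small standalone helper that reproduces the hard-coded discounted_rewards above (discount 0.75, returns reset at episode boundaries). This is an independent sketch of standard discounted-return bookkeeping, not the library's internal reward_estimation code:

import numpy as np

def discounted_returns(rewards, terminals, discount):
    # Accumulate returns backwards, restarting whenever an episode terminates.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# discounted_returns(rewards, terminals, 0.75) matches the discounted_rewards array above,
# and the expected advantage is simply that result minus the baseline predictions.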
Example #7
    def test_basic(self):

        config = Configuration(
            discount=0.75,
            batch_size=8,
            learning_rate=0.001,
        )
        # agent = VPGAgent(
        #     states_spec=dict(shape=(1,)),
        #     actions_spec=dict(type='int', num_actions=2),
        #     network_spec=[dict(type='dense', size=32)],
        #     config=config
        # )

        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        actions = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        discounted_rewards = np.array([
            0.75 + 0.75**4, 1.0 + 0.75**3, 0.75**2, 0.75, 1.0, 1.0 + 0.75**2,
            0.75, 1.0, 0.0
        ])

        feed_dict = dict()
        # feed_dict[agent.model.reward_input] = rewards
        # fetches = [agent.model.get_reward()]
        # result = agent.model.session.run(feed_dict=feed_dict, fetches=fetches)

        expected = discounted_rewards
Example #8
    def test_multi_baseline(self):
        class CustomNetwork(LayerBasedNetwork):
            def tf_apply(self, x, internals, update, return_internals=False):
                layer01 = Dense(size=32, scope='state0-1')
                self.add_layer(layer=layer01)
                layer02 = Dense(size=32, scope='state0-2')
                self.add_layer(layer=layer02)
                x0 = layer02.apply(x=layer01.apply(x=x['state0'], update=update), update=update)
                layer11 = Dense(size=32, scope='state1-1')
                self.add_layer(layer=layer11)
                layer12 = Dense(size=32, scope='state1-2')
                self.add_layer(layer=layer12)
                x1 = layer12.apply(x=layer11.apply(x=x['state1'], update=update), update=update)
                layer21 = Dense(size=32, scope='state2-1')
                self.add_layer(layer=layer21)
                layer22 = Dense(size=32, scope='state2-2')
                self.add_layer(layer=layer22)
                x2 = layer22.apply(x=layer21.apply(x=x['state2'], update=update), update=update)
                layer31 = Dense(size=32, scope='state3-1')
                self.add_layer(layer=layer31)
                layer32 = Dense(size=32, scope='state3-2')
                self.add_layer(layer=layer32)
                x3 = layer32.apply(x=layer31.apply(x=x['state3'], update=update), update=update)
                x = x0 * x1 * x2 * x3
                return (x, list()) if return_internals else x

        environment = MinimalTest(specification=[
            ('bool', ()),
            ('int', (2,)),
            ('float', (1, 1)),
            ('bounded-float', (1,))
        ])
        config = Configuration(
            batch_size=8,
            baseline_mode='states',
            baseline=dict(
                type='aggregated',
                baselines=dict(
                    state0=dict(type='mlp', sizes=[32, 32]),
                    state1=dict(type='mlp', sizes=[32, 32]),
                    state2=dict(type='mlp', sizes=[32, 32]),
                    state3=dict(type='mlp', sizes=[32, 32])
                )
            ),
            baseline_optimizer=dict(
                type='multi_step',
                optimizer=dict(type='adam', learning_rate=0.001),
                num_steps=5
            )
        )
        self.base_test(name='multi-baseline',
                       environment=environment,
                       network_spec=CustomNetwork,
                       config=config)
Example #9
class TestTRPOAgent(BaseAgentTest, unittest.TestCase):

    agent = TRPOAgent
    deterministic = False
    config = Configuration(
        batch_size=64,
        normalize_rewards=True
    )
Example #10
    def __init__(self, *args, **kwargs):
        """
        Initialize the configuration, filling it first from *args (in the order defined by
        self.config_args), then from **kwargs, and finally falling back to self.default_config
        for anything still unset.

        :param args: optional positional configuration values
        :param kwargs: optional keyword configuration values
        """
        self.config = Configuration()

        for i, arg in enumerate(args):
            if i >= len(self.config_args):
                break
            self.config.default({self.config_args[i]: arg})

        self.config.default(kwargs)
        self.config.default(self.default_config)
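
For illustration, here is a minimal sketch of how that fallback order plays out. The ExampleAgent class, its config_args, and its default_config are hypothetical, and Configuration.default() is assumed to fill in only those keys that are not already set:

from tensorforce import Configuration

class ExampleAgent(object):
    # Hypothetical user of the __init__ shown above.
    config_args = ['batch_size', 'learning_rate']
    default_config = dict(batch_size=1, learning_rate=0.01, discount=0.99)

    def __init__(self, *args, **kwargs):
        self.config = Configuration()
        for i, arg in enumerate(args):
            if i >= len(self.config_args):
                break
            self.config.default({self.config_args[i]: arg})
        self.config.default(kwargs)
        self.config.default(self.default_config)

# batch_size comes from the positional argument, discount from the keyword argument,
# and learning_rate falls back to default_config.
agent = ExampleAgent(16, discount=0.9)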
Example #11
    def test_multi(self):
        passed = 0

        def network_builder(inputs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
            return state0 * state1 * state2

        for _ in xrange(5):
            environment = MinimalTest(
                definition=[False, (False, 2), (False, (1, 2))])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   demo_memory_capacity=100,
                                   demo_sampling_ratio=0.2,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = dict(action0=1, action1=(1, 1), action2=((1, 1), ))
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 50 or not all(
                    x >= 1.0 for x in r.episode_rewards[-50:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #12
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   demo_memory_capacity=100,
                                   demo_sampling_ratio=0.2,
                                   memory=dict(type='replay',
                                               random_sampling=True),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #13
    def test_dqfd_agent(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=16,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                demo_memory_capacity=100,
                demo_sampling_ratio=0.1,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder(layers_config=[
                    dict(type='dense', size=32, l2_regularization=0.0001)
                ]))
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #14
    def test_adam(self):
        environment = MinimalTest(specification=[('int', ())])
        network_spec = [
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ]
        config = Configuration(batch_size=8,
                               optimizer=dict(type='adam', learning_rate=1e-3))
        self.base_test(name='adam',
                       environment=environment,
                       network_spec=network_spec,
                       config=config)
Example #15
class TestRandomAgent(BaseAgentTest, unittest.TestCase):

    agent = RandomAgent
    deterministic = False
    requires_network = False
    # Random agent is not expected to pass anything
    pass_threshold = 0.0

    config = Configuration()

    # Not using a network so no point in testing LSTM
    exclude_lstm = True
Example #16
class TestDDQNAgent(BaseAgentTest, unittest.TestCase):

    agent = DDQNAgent
    deterministic = True

    config = Configuration(memory=dict(type='replay', capacity=1000),
                           batch_size=8,
                           first_update=10,
                           target_sync_frequency=10)

    exclude_float = True
    exclude_bounded = True
Example #17
def getAgent(shapeIn, shapeOut):

    config = Configuration(batch_size=1,
                           step_optimizer=dict(type='adam',
                                               learning_rate=1e-4))

    # Create a Proximal Policy Optimization agent
    agent = PPOAgent(
        dict(type='float', shape=shapeIn[0]),
        dict(type='float', shape=shapeOut[0]),
        [dict(type='dense', size=64)],
        config
    )

    return agent
Example #18
    def test_baseline_no_optimizer(self):
        environment = MinimalTest(specification=[('int', ())])
        network_spec = [
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ]
        config = Configuration(batch_size=8,
                               baseline_mode='states',
                               baseline=dict(type='mlp', sizes=[32, 32]))
        self.base_test(name='baseline-no-optimizer',
                       environment=environment,
                       network_spec=network_spec,
                       config=config)
Example #19
    def test_replay(self):
        environment = MinimalTest(specification=[('int', ())])
        network_spec = [
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ]
        config = Configuration(memory=dict(type='replay', capacity=1000),
                               batch_size=8,
                               first_update=10,
                               target_sync_frequency=10)
        self.base_test(name='replay',
                       environment=environment,
                       network_spec=network_spec,
                       config=config)
Example #20
class TestDQNAgent(BaseAgentTest, unittest.TestCase):

    agent = DQNAgent
    deterministic = True

    config = Configuration(
        batch_size=8,
        memory_capacity=800,
        first_update=80,
        target_update_frequency=20
    )

    exclude_float = True
    exclude_bounded = True
Example #21
class TestDQNAgent(BaseAgentTest, unittest.TestCase):

    agent = DQNAgent
    deterministic = True

    config = Configuration(memory=dict(type='replay', capacity=1000),
                           optimizer=dict(type="adam", learning_rate=0.002),
                           repeat_update=4,
                           batch_size=32,
                           first_update=64,
                           target_sync_frequency=10)

    exclude_float = True
    exclude_bounded = True
Example #22
    def test_discrete(self):
        environment = MinimalTest(definition=False)
        config = Configuration(states=environment.states,
                               actions=environment.actions)
        agent = RandomAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                r.episode_rewards[-100:], r.episode_lengths[-100:]))

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Random agent (discrete): ' + str(runner.episode))
        self.assertTrue(runner.episode == 1000)
Example #23
    def test_continuous(self):
        environment = MinimalTest(definition=True)
        config = Configuration(states=environment.states,
                               actions=environment.actions)
        agent = RandomAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Random agent (continuous): ' + str(runner.episode))
        self.assertTrue(runner.episode == 1000)
Example #24
    def test_multi_baseline(self):
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'),
                           size=32, scope='state0-2')
            state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'),
                           size=32, scope='state1-2')
            state2 = layer(x=layer(x=inputs['state2'], size=32, scope='state2-1'),
                           size=32, scope='state2-2')
            return state0 * state1 * state2

        for _ in xrange(5):
            environment = MinimalTest(
                definition=[False, (False, 2), (True, 2)])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   baseline=dict(type="mlp",
                                                 sizes=[32, 32],
                                                 epochs=5,
                                                 update_batch_size=8,
                                                 learning_rate=0.01),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=4000, episode_finished=episode_finished)
            print('VPG agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 4000:
                passed += 1

        print('VPG agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #25
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Proximal Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=256,

                memory=dict(
                    type='prioritized_replay',
                ),
                update_frequency=256,
                first_update=512,

                learning_rate=0.0001,
                optimizer_batch_size=64,
                normalize_rewards=False,
                gae_rewards=False,
                baseline=dict(
                    type="mlp",
                    sizes=[32, 32],
                    epochs=1,
                    update_batch_size=64,
                    learning_rate=0.001
                ),
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Stop once the mean reward over the last 50 episodes shows that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #26
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                exploration=dict(type='ornstein_uhlenbeck'),
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                clip_gradients=1.0,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
                # batch_size=8,
                # learning_rate=0.0025,
                # # exploration="OrnsteinUhlenbeckProcess",
                # # exploration_kwargs=dict(
                # #     sigma=0.1,
                # #     mu=0,
                # #     theta=0.1
                # # ),
                # discount=0.99,
                # memory_capacity=800,
                # first_update=80,
                # repeat_update=4,
                # target_update_frequency=20,
                # states=environment.states,
                # actions=environment.actions,
                # clip_gradients=5.0,
                # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
            )
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('NAF Agent: ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('NAF Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
Example #27
    def test_multi(self):
        environment = MinimalTest(
            definition=[False, (False, 2), (False, (1, 2)), (True, (1, 2))])
        config = Configuration(states=environment.states,
                               actions=environment.actions)
        agent = RandomAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 20 or not all(x >= 1.0
                                             for x in r.episode_rewards[-20:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Random agent (multi-state/action): ' + str(runner.episode))
        self.assertTrue(runner.episode == 1000)
Example #28
class TestDQNNstepAgent(BaseAgentTest, unittest.TestCase):

    agent = DQNNstepAgent
    deterministic = True

    config = Configuration(
        batch_size=8,
        optimizer=dict(
            type='adam',
            learning_rate=1e-2
        )
    )

    exclude_float = True
    exclude_bounded = True
Example #29
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = TRPOAgent(config=Configuration(
                log_level='info',
                batch_size=100,
                baseline=dict(
                    type='mlp',
                    size=32,
                    hidden_layers=1,
                    epochs=20,
                    update_batch_size=32
                ),
                generalized_advantage_estimation=True,
                normalize_advantage=False,
                gae_lambda=0.97,
                max_kl_divergence=0.005,
                cg_iterations=20,
                cg_damping=0.01,
                ls_max_backtracks=20,
                ls_override=False,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Stop once the mean reward over the last 50 episodes shows that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #30
    def test_multi(self):
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'),
                           size=32, scope='state0-2')
            state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'),
                           size=32, scope='state1-2')
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[False, (False, 2)])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   memory=dict(type='replay',
                                               random_sampling=True),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = CategoricalDQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 15 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-15:], r.episode_lengths[-15:]))

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('Categorical DQN agent (multi-state/action): ' +
                  str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('Categorical DQN agent (multi-state/action) passed = {}'.format(
            passed))
        self.assertTrue(passed >= 2)
Example #31
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', default='DQNAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    env = OpenAIUniverse(args.gym_id)
    env.configure(remotes=1)

    default = dict(
        repeat_actions=1,
        actions=env.actions,
        states=env.states,
        max_episode_length=args.max_timesteps
    )

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()

    config.default(default)

    if args.network_config:
        network_config = Configuration.from_json(args.network_config).network_layers
    else:
        if config.network_layers:
            network_config = config.network_layers
        else:
            raise TensorForceError("Error: No network configuration provided.")

    if args.debug:
        print("Configuration:")
        print(config)

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[config.loglevel])

    stack = None

    agent = create_agent(args.agent, config, network_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}: could not create directory.".format(save_dir))
        runner.save_model(args.save, args.save_episodes)

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if getattr(args, 'monitor', None):  # the --monitor flag is commented out above
        env.gym.monitor.close()
    env.close()
Example #32
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux', help="Starter mode")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true')
    parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--is-child',
                '--agent', args.agent,
                '--agent-config', os.path.join(os.getcwd(), args.agent_config),
                '--network-config', os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers,
                '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = ['mkdir -p {}'.format(args.logdir),
                    'rm -f {}/kill.sh'.format(args.logdir),
                    'echo "#!/bin/bash" > {}/kill.sh'.format(args.logdir),
                    'chmod +x {}/kill.sh'.format(args.logdir)]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(
        states=environment.states,
        actions=environment.actions,
        network=from_json(args.network_config)
    ))

    agent_config.default(dict(
        distributed=True,
        cluster_spec=cluster_spec,
        global_model=(args.task_index == -1),
        device=('/job:ps' if args.task_index == -1 else '/job:worker/task:{}/cpu:0'.format(args.task_index))
    ))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.loglevel])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        cluster_spec=cluster_spec,
        task_index=args.task_index
    )

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
Example #33
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # TODO: make the log level configurable

    environment = OpenAIGym(args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        agent_config = Configuration()
        logger.info("No agent configuration provided.")
    if args.network_config:
        network = from_json(args.network_config)
    else:
        network = None
        logger.info("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions, network=network))
    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}: could not create directory.".format(save_dir))

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if args.monitor:
        environment.gym.monitor.close()
    environment.close()