def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        return state0 * state1

    for _ in xrange(5):
        environment = MinimalTest(definition=[True, (True, 2)])
        config = Configuration(
            batch_size=16,
            learning_rate=0.00025,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 20 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-20:], r.episode_lengths[-20:])
            )

        runner.run(episodes=10000, episode_finished=episode_finished)
        print('NAF agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 10000:
            passed += 1

    print('NAF agent (multi-state/action) passed = {}'.format(passed))
    # Note: passed >= 0 is trivially true, so any outcome is accepted here.
    self.assertTrue(passed >= 0)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=50,
            memory='replay',
            first_update=20,
            repeat_update=4,
            target_update_frequency=10,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Replay DQN: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('Replay DQN passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('VPG Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('VPG discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_naf_agent(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('NAF agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('NAF agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_reinforceio_homepage(self):
    """
    Code example from the homepage and README.md.
    """
    from tensorforce import Configuration
    from tensorforce.agents import TRPOAgent

    config = Configuration(batch_size=100)

    # Create a Trust Region Policy Optimization agent
    agent = TRPOAgent(
        states_spec=dict(shape=(10,), type='float'),
        actions_spec=dict(type='int', num_actions=2),
        network_spec=[
            dict(type='dense', size=50),
            dict(type='dense', size=50)
        ],
        config=config
    )

    # Get new data from somewhere, e.g. a client to a web app
    client = TestTutorialCode.MyClient('http://127.0.0.1', 8080)

    # Poll new state from client
    state = client.get_state()

    # Get prediction from agent, execute
    action = agent.act(states=state)
    reward = client.execute(action)

    # Add experience, agent automatically updates model according to batch size
    agent.observe(reward=reward, terminal=False)
    agent.close()
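# The example above references TestTutorialCode.MyClient, which is defined
# elsewhere in the test class. A minimal stub could look like the sketch below
# (hypothetical; the actual helper in the test suite may differ):
class MyClient(object):

    def __init__(self, url, port):
        # A real client would open a connection; the stub just records the target.
        self.url = url
        self.port = port

    def get_state(self):
        # Return a dummy observation matching states_spec: shape (10,), float.
        import numpy as np
        return np.random.rand(10)

    def execute(self, action):
        # Pretend to apply the action and return a scalar reward.
        return 1.0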
def test_baseline(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001
    )
    # agent = VPGAgent(
    #     states_spec=dict(shape=(1,)),
    #     actions_spec=dict(type='int', num_actions=2),
    #     network_spec=[dict(type='dense', size=32)],
    #     config=config
    # )

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]
    discounted_rewards = np.array([
        0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0,
        1.0 + 0.75 ** 2, 0.75, 1.0, 0.0
    ])
    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])

    # agent.model.baseline = dict(state=Baseline())
    # agent.model.baseline['state'].predict = lambda states: baseline
    # result, _ = agent.model.reward_estimation(states=dict(state=states), rewards=rewards, terminals=terminals)
    expected = discounted_rewards - baseline
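# If the commented-out reward_estimation call above were re-enabled, the final
# check would presumably be (hypothetical, mirroring the names used here):
# np.testing.assert_allclose(np.asarray(result), expected)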
def test_basic(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001
    )
    # agent = VPGAgent(
    #     states_spec=dict(shape=(1,)),
    #     actions_spec=dict(type='int', num_actions=2),
    #     network_spec=[dict(type='dense', size=32)],
    #     config=config
    # )

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    actions = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]
    discounted_rewards = np.array([
        0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0,
        1.0 + 0.75 ** 2, 0.75, 1.0, 0.0
    ])

    feed_dict = dict()
    # feed_dict[agent.model.reward_input] = rewards
    # fetches = [agent.model.get_reward()]
    # result = agent.model.session.run(feed_dict=feed_dict, fetches=fetches)
    expected = discounted_rewards
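# Reference sketch (not part of the original tests): the hard-coded
# discounted_rewards arrays above follow from accumulating gamma-discounted
# returns backwards through the batch, resetting at episode boundaries. The
# helper below is hypothetical, included for checking the numbers by hand:
import numpy as np

def discount(rewards, terminals, gamma):
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            # Terminal step: no return is bootstrapped across the episode boundary.
            running = 0.0
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# With the arrays above,
# np.allclose(discount(rewards, terminals, 0.75), discounted_rewards)
# evaluates to True.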
def test_multi_baseline(self):

    class CustomNetwork(LayerBasedNetwork):

        def tf_apply(self, x, internals, update, return_internals=False):
            layer01 = Dense(size=32, scope='state0-1')
            self.add_layer(layer=layer01)
            layer02 = Dense(size=32, scope='state0-2')
            self.add_layer(layer=layer02)
            x0 = layer02.apply(x=layer01.apply(x=x['state0'], update=update), update=update)

            layer11 = Dense(size=32, scope='state1-1')
            self.add_layer(layer=layer11)
            layer12 = Dense(size=32, scope='state1-2')
            self.add_layer(layer=layer12)
            x1 = layer12.apply(x=layer11.apply(x=x['state1'], update=update), update=update)

            layer21 = Dense(size=32, scope='state2-1')
            self.add_layer(layer=layer21)
            layer22 = Dense(size=32, scope='state2-2')
            self.add_layer(layer=layer22)
            x2 = layer22.apply(x=layer21.apply(x=x['state2'], update=update), update=update)

            layer31 = Dense(size=32, scope='state3-1')
            self.add_layer(layer=layer31)
            layer32 = Dense(size=32, scope='state3-2')
            self.add_layer(layer=layer32)
            x3 = layer32.apply(x=layer31.apply(x=x['state3'], update=update), update=update)

            x = x0 * x1 * x2 * x3
            return (x, list()) if return_internals else x

    environment = MinimalTest(
        specification=[('bool', ()), ('int', (2,)), ('float', (1, 1)), ('bounded-float', (1,))]
    )
    config = Configuration(
        batch_size=8,
        baseline_mode='states',
        baseline=dict(
            type='aggregated',
            baselines=dict(
                state0=dict(type='mlp', sizes=[32, 32]),
                state1=dict(type='mlp', sizes=[32, 32]),
                state2=dict(type='mlp', sizes=[32, 32]),
                state3=dict(type='mlp', sizes=[32, 32])
            )
        ),
        baseline_optimizer=dict(
            type='multi_step',
            optimizer=dict(type='adam', learning_rate=0.001),
            num_steps=5
        )
    )
    self.base_test(name='multi-baseline', environment=environment, network_spec=CustomNetwork, config=config)
class TestTRPOAgent(BaseAgentTest, unittest.TestCase):

    agent = TRPOAgent
    deterministic = False

    config = Configuration(
        batch_size=64,
        normalize_rewards=True
    )
def __init__(self, *args, **kwargs):
    """
    Initialize the configuration. Starts from an empty Configuration, then
    applies defaults first from *args (their order is defined by
    self.config_args), then from **kwargs, and finally from self.default_config.

    :param args: optional positional config values
    :param kwargs: optional keyword config values
    """
    self.config = Configuration()
    for i, arg in enumerate(args):
        if i >= len(self.config_args):
            break
        self.config.default({self.config_args[i]: arg})
    self.config.default(kwargs)
    self.config.default(self.default_config)
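# Hypothetical usage sketch (MyAgent and its config_args are assumptions, not
# from the original source): positional values map onto config_args in order
# before keyword values and class-level defaults are applied, e.g.
#
#     class MyAgent(Agent):
#         config_args = ['batch_size', 'learning_rate']
#         default_config = Configuration(batch_size=1, learning_rate=1e-3)
#
#     MyAgent(8, 0.001)                           # equivalent to ...
#     MyAgent(batch_size=8, learning_rate=0.001)  # ... this keyword form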
def test_multi(self):
    passed = 0

    def network_builder(inputs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
        return state0 * state1 * state2

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2), (False, (1, 2))])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            demo_memory_capacity=100,
            demo_sampling_ratio=0.2,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = DQFDAgent(config=config)

        # First generate demonstration data and pretrain
        demonstrations = list()
        terminal = True
        for n in xrange(50):
            if terminal:
                state = environment.reset()
            action = dict(action0=1, action1=(1, 1), action2=((1, 1),))
            state, reward, terminal = environment.execute(action=action)
            demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
            demonstrations.append(demonstration)

        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)

        # Normal training
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 50 or not all(x >= 1.0 for x in r.episode_rewards[-50:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQFD agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQFD agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            demo_memory_capacity=100,
            demo_sampling_ratio=0.2,
            memory=dict(type='replay', random_sampling=True),
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = DQFDAgent(config=config)

        # First generate demonstration data and pretrain
        demonstrations = list()
        terminal = True
        for n in xrange(50):
            if terminal:
                state = environment.reset()
            action = 1
            state, reward, terminal = environment.execute(action=action)
            demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
            demonstrations.append(demonstration)

        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)

        # Normal training
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQFD agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQFD agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_dqfd_agent(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=16,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            demo_memory_capacity=100,
            demo_sampling_ratio=0.1,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder(layers_config=[
                dict(type='dense', size=32, l2_regularization=0.0001)
            ])
        )
        agent = DQFDAgent(config=config)

        # First generate demonstration data and pretrain
        demonstrations = list()
        terminal = True
        for n in xrange(50):
            if terminal:
                state = environment.reset()
            action = 1
            state, reward, terminal = environment.execute(action=action)
            demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
            demonstrations.append(demonstration)

        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)

        # Normal training
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQFD Agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQFD Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_adam(self):
    environment = MinimalTest(specification=[('int', ())])
    network_spec = [
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]
    config = Configuration(
        batch_size=8,
        optimizer=dict(type='adam', learning_rate=1e-3)
    )
    self.base_test(name='adam', environment=environment, network_spec=network_spec, config=config)
class TestRandomAgent(BaseAgentTest, unittest.TestCase):

    agent = RandomAgent
    deterministic = False
    requires_network = False

    # Random agent is not expected to pass anything
    pass_threshold = 0.0

    config = Configuration()

    # Not using a network, so no point in testing LSTM
    exclude_lstm = True
class TestDDQNAgent(BaseAgentTest, unittest.TestCase):

    agent = DDQNAgent
    deterministic = True

    config = Configuration(
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        first_update=10,
        target_sync_frequency=10
    )

    exclude_float = True
    exclude_bounded = True
def getAgent(shapeIn, shapeOut):
    config = Configuration(
        batch_size=1,
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    # Create a Proximal Policy Optimization agent
    agent = PPOAgent(
        dict(type='float', shape=shapeIn[0]),
        dict(type='float', shape=shapeOut[0]),
        [dict(type='dense', size=64)],
        config
    )
    return agent
def test_baseline_no_optimizer(self):
    environment = MinimalTest(specification=[('int', ())])
    network_spec = [
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]
    config = Configuration(
        batch_size=8,
        baseline_mode='states',
        baseline=dict(type='mlp', sizes=[32, 32])
    )
    self.base_test(name='baseline-no-optimizer', environment=environment, network_spec=network_spec, config=config)
def test_replay(self):
    environment = MinimalTest(specification=[('int', ())])
    network_spec = [
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]
    config = Configuration(
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        first_update=10,
        target_sync_frequency=10
    )
    self.base_test(name='replay', environment=environment, network_spec=network_spec, config=config)
class TestDQNAgent(BaseAgentTest, unittest.TestCase):

    agent = DQNAgent
    deterministic = True

    config = Configuration(
        batch_size=8,
        memory_capacity=800,
        first_update=80,
        target_update_frequency=20
    )

    exclude_float = True
    exclude_bounded = True
class TestDQNAgent(BaseAgentTest, unittest.TestCase):

    agent = DQNAgent
    deterministic = True

    config = Configuration(
        memory=dict(type='replay', capacity=1000),
        optimizer=dict(type='adam', learning_rate=0.002),
        repeat_update=4,
        batch_size=32,
        first_update=64,
        target_sync_frequency=10
    )

    exclude_float = True
    exclude_bounded = True
def test_discrete(self):
    environment = MinimalTest(definition=False)
    config = Configuration(states=environment.states, actions=environment.actions)
    agent = RandomAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(
            x / l >= 0.9 for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
        )

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Random agent (discrete): ' + str(runner.episode))
    self.assertTrue(runner.episode == 1000)
def test_continuous(self):
    environment = MinimalTest(definition=True)
    config = Configuration(states=environment.states, actions=environment.actions)
    agent = RandomAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Random agent (continuous): ' + str(runner.episode))
    self.assertTrue(runner.episode == 1000)
def test_multi_baseline(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2')
        state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2')
        state2 = layer(x=layer(x=inputs['state2'], size=32, scope='state2-1'), size=32, scope='state2-2')
        return state0 * state1 * state2

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2), (True, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            baseline=dict(
                type='mlp',
                sizes=[32, 32],
                epochs=5,
                update_batch_size=8,
                learning_rate=0.01
            ),
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=4000, episode_finished=episode_finished)
        print('VPG agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 4000:
            passed += 1

    print('VPG agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_example(self):
    passed = 0

    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=256,
            memory=dict(type='prioritized_replay'),
            update_frequency=256,
            first_update=512,
            learning_rate=0.0001,
            optimizer_batch_size=64,
            normalize_rewards=False,
            gae_rewards=False,
            baseline=dict(
                type='mlp',
                sizes=[32, 32],
                epochs=1,
                update_batch_size=64,
                learning_rate=0.001
            ),
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # A mean reward of 50 over the last 50 episodes indicates that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_naf_agent(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            clip_gradients=1.0,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
            # Earlier configuration, kept for reference:
            # batch_size=8,
            # learning_rate=0.0025,
            # # exploration="OrnsteinUhlenbeckProcess",
            # # exploration_kwargs=dict(
            # #     sigma=0.1,
            # #     mu=0,
            # #     theta=0.1
            # # ),
            # discount=0.99,
            # memory_capacity=800,
            # first_update=80,
            # repeat_update=4,
            # target_update_frequency=20,
            # states=environment.states,
            # actions=environment.actions,
            # clip_gradients=5.0,
            # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('NAF Agent: ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('NAF Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 3)
def test_multi(self):
    environment = MinimalTest(definition=[False, (False, 2), (False, (1, 2)), (True, (1, 2))])
    config = Configuration(states=environment.states, actions=environment.actions)
    agent = RandomAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 20 or not all(x >= 1.0 for x in r.episode_rewards[-20:])

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Random agent (multi-state/action): ' + str(runner.episode))
    self.assertTrue(runner.episode == 1000)
class TestDQNNstepAgent(BaseAgentTest, unittest.TestCase):

    agent = DQNNstepAgent
    deterministic = True

    config = Configuration(
        batch_size=8,
        optimizer=dict(
            type='adam',
            learning_rate=1e-2
        )
    )

    exclude_float = True
    exclude_bounded = True
def test_example(self):
    passed = 0

    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=Configuration(
            log_level='info',
            batch_size=100,
            baseline=dict(
                type='mlp',
                size=32,
                hidden_layers=1,
                epochs=20,
                update_batch_size=32
            ),
            generalized_advantage_estimation=True,
            normalize_advantage=False,
            gae_lambda=0.97,
            max_kl_divergence=0.005,
            cg_iterations=20,
            cg_damping=0.01,
            ls_max_backtracks=20,
            ls_override=False,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # A mean reward of 50 over the last 50 episodes indicates that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2')
        state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2')
        return state0 * state1

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            memory=dict(type='replay', random_sampling=True),
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = CategoricalDQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 15 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-15:], r.episode_lengths[-15:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('Categorical DQN agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('Categorical DQN agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="ID of the gym environment") parser.add_argument('-a', '--agent', default='DQNAgent') parser.add_argument('-c', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-config', help="Network configuration file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode") # parser.add_argument('-m', '--monitor', help="Save results to this directory") # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() env = OpenAIUniverse(args.gym_id) env.configure(remotes=1) default = dict( repeat_actions=1, actions=env.actions, states=env.states, max_episode_length=args.max_timesteps ) if args.agent_config: config = Configuration.from_json(args.agent_config) else: config = Configuration() config.default(default) if args.network_config: network_config = Configuration.from_json(args.network_config).network_layers else: if config.network_layers: network_config = config.network_layers else: raise TensorForceError("Error: No network configuration provided.") if args.debug: print("Configuration:") print(config) logger = logging.getLogger(__name__) logger.setLevel(log_levels[config.loglevel]) stack = None agent = create_agent(args.agent, config, network_config) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) agent.load_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(config) runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) runner.save_model(args.save, args.save_episodes) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:]))) logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:]))) return True logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) if args.monitor: env.gym.monitor.close() env.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux', help="Starter mode")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true')
    parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # Start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--is-child',
                '--agent', args.agent,
                '--agent-config', os.path.join(os.getcwd(), args.agent_config),
                '--network-config', os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers,
                '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#!/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))
        os.system("\n".join(cmds))
        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")

    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions, network=from_json(args.network_config)))

    agent_config.default(dict(
        distributed=True,
        cluster_spec=cluster_spec,
        global_model=(args.task_index == -1),
        device=('/job:ps' if args.task_index == -1 else '/job:worker/task:{}/cpu:0'.format(args.task_index))
    ))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.loglevel])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        cluster_spec=cluster_spec,
        task_index=args.task_index
    )

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # TODO: make the log level configurable

    environment = OpenAIGym(args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        agent_config = Configuration()
        logger.info("No agent configuration provided.")

    if args.network_config:
        network = from_json(args.network_config)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent_config.default(dict(states=environment.states, actions=environment.actions, network=network))
    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}: could not create directory.".format(save_dir))

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if args.monitor:
        environment.gym.monitor.close()
    environment.close()
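# Standard script entry point (assumed; the guard is not shown in this excerpt):
if __name__ == '__main__':
    main()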