def __init__(
    self, agent, environment, max_episode_timesteps=None, evaluation_environment=None,
    save_best_agent=None
):
    """
    Create the runner's environment(s) and agent and reset the episode statistics.

    Args:
        agent: Agent instance or specification, passed to `Agent.create`.
        environment: Environment instance or specification, passed to
            `Environment.create`.
        max_episode_timesteps: Passed to `Environment.create` for both the training
            and the evaluation environment.
        evaluation_environment: Optional second environment instance or
            specification. Its `states()` and `actions()` must equal those of
            `environment`.
        save_best_agent: Stored as-is on the runner.

    Raises:
        AssertionError: If the evaluation environment's states or actions differ
            from those of the training environment.
    """
    # Record whether the caller handed in an already-constructed Environment.
    self.is_environment_external = isinstance(environment, Environment)
    self.environment = Environment.create(
        environment=environment, max_episode_timesteps=max_episode_timesteps
    )

    # Always define the flag (False for None) so that it can be read even when no
    # evaluation environment was given.
    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(
            environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps
        )
        # The agent is created for the training environment only, so both
        # environments have to agree on their specification.
        assert self.evaluation_environment.states() == self.environment.states()
        assert self.evaluation_environment.actions() == self.environment.actions()

    self.is_agent_external = isinstance(agent, Agent)
    self.agent = Agent.create(agent=agent, environment=self.environment)
    self.save_best_agent = save_best_agent

    # Per-episode statistics, one entry appended per finished episode elsewhere.
    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
def __init__(self, agent, environment, evaluation_environment=None, save_best_agent=False):
    """
    Create environment(s) and agent for the runner and reset episode statistics.

    Args:
        agent: Agent instance or specification, passed to `Agent.create`.
        environment: Environment instance or specification.
        evaluation_environment: Optional evaluation environment instance or
            specification.
        save_best_agent: When exactly `True`, the agent is created with a saver
            whose `seconds` and `steps` are `None` (this overrides any saver of
            the specification) and the agent must not be an `Agent` instance.
    """
    self.is_environment_external = isinstance(environment, Environment)
    self.environment = Environment.create(environment=environment)

    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    self.evaluation_environment = (
        None if evaluation_environment is None
        else Environment.create(environment=evaluation_environment)
    )

    self.save_best_agent = save_best_agent
    self.is_agent_external = isinstance(agent, Agent)

    agent_overrides = {}
    if self.save_best_agent is True:
        # Periodic saving is disabled; an already-built agent cannot be reconfigured.
        assert not self.is_agent_external
        agent_overrides['saver'] = {'seconds': None, 'steps': None}
    self.agent = Agent.create(agent=agent, environment=self.environment, **agent_overrides)

    # Per-episode statistics.
    self.episode_rewards = []
    self.episode_timesteps = []
    self.episode_seconds = []
    self.episode_agent_seconds = []
def test_bad_initialization():
    """Expect ValueError when Bollux is created with bad_seed_count=10 but seed_count=3."""
    # presumably rejected because more bad seeds than seeds are requested -- confirm
    invalid_settings = dict(seed_count=3, bad_seed_count=10, max_episode_length=100)
    with pytest.raises(ValueError):
        Environment.create(environment=Bollux, **invalid_settings)
def __init__(self, agent, environments, evaluation_environment=None, save_best_agent=False):
    """
    Create the parallel environments and the shared agent, and reset statistics.

    Args:
        agent: Agent instance or specification, passed to `Agent.create`.
        environments: Non-empty iterable of environment instances or
            specifications; the agent is created against the first one.
        evaluation_environment: Optional evaluation environment instance or
            specification.
        save_best_agent: If truthy and the agent is not an `Agent` instance, the
            agent is created with a saver whose `seconds` and `steps` are `None`
            (this overrides any saver of the specification).

    Raises:
        TensorforceError: If `environments` is not iterable or is empty.
    """
    if not util.is_iterable(x=environments):
        raise TensorforceError.type(
            name='parallel-runner', argument='environments', value=environments
        )
    elif len(environments) == 0:
        raise TensorforceError.value(
            name='parallel-runner', argument='environments', value=environments
        )

    # Record which objects were handed in already constructed.
    self.is_environment_external = tuple(
        isinstance(environment, Environment) for environment in environments
    )
    self.environments = tuple(
        Environment.create(environment=environment) for environment in environments
    )

    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(environment=evaluation_environment)

    self.save_best_agent = save_best_agent
    self.is_agent_external = isinstance(agent, Agent)

    kwargs = dict(parallel_interactions=len(environments))
    if not self.is_agent_external and self.save_best_agent:
        # Disable periodic saving. Add the entry instead of rebinding kwargs, so
        # that parallel_interactions is still passed to the agent.
        kwargs['saver'] = dict(seconds=None, steps=None)
    self.agent = Agent.create(agent=agent, environment=self.environments[0], **kwargs)
    if not self.agent.model.is_initialized:
        self.agent.initialize()

    # Per-episode training statistics.
    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()

    # Per-episode evaluation statistics.
    self.evaluation_rewards = list()
    self.evaluation_timesteps = list()
    self.evaluation_seconds = list()
    self.evaluation_agent_seconds = list()
def set_up():
    """
    Build the BadSeeds02 environment and a DQN agent for it.

    Returns:
        tuple: `(environment, agent)`.
    """
    tensorflow_settings()

    env_options = dict(
        seed_count=10, bad_seed_count=3, history_block=2, max_episode_timesteps=500
    )
    bad_seeds_environment = Environment.create(environment=BadSeeds02, **env_options)

    # Flatten layer followed by two identical dense tanh layers.
    dense_layer = dict(type='dense', size=32, activation='tanh')
    network_spec = [dict(type='flatten'), dict(dense_layer), dict(dense_layer)]

    # Summary values are stored every 100 timesteps.
    summary_spec = dict(
        directory="training_data/agent_02_env_02/summaries", labels="all", frequency=100
    )

    agent = Agent.create(
        agent="dqn",
        network=network_spec,
        environment=bad_seeds_environment,
        batch_size=256,
        memory=int(10**7),
        exploration=0.15,
        summarizer=summary_spec,
    )
    return bad_seeds_environment, agent
def set_up():
    """
    Build the CartSeed01 environment and an A2C agent for it.

    No saver is configured for the agent.

    Returns:
        tuple: `(environment, agent)`.
    """
    tensorflow_settings()

    env_options = dict(seed_count=10, bad_seed_count=3, max_count=20)
    env = Environment.create(environment=CartSeed01, **env_options)

    summary_spec = dict(
        directory="training_data/a2c_cartseed/summaries", labels="all", frequency=10
    )
    agent_options = dict(
        batch_size=10000,
        horizon=50,
        discount=0.97,
        l2_regularization=0.1,
        variable_noise=0.5,
    )
    agent = Agent.create(
        agent="a2c", environment=env, summarizer=summary_spec, **agent_options
    )
    return env, agent
def __init__(self, environment: 'TradingEnvironment', agent_spec: any, save_best_agent: bool = False, **kwargs):
    """
    Wrap the trading environment and build the agent and runner.

    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        save_best_agent (optional): Passed on to the runner.
        kwargs (optional): Only `max_episode_timesteps` is read here (default
            `False`); it is passed to `Environment.create`.
    """
    episode_limit = kwargs.get('max_episode_timesteps', False)
    self._max_episode_timesteps = episode_limit

    # The trading environment is handed to the 'gym' environment type as its level.
    self._environment = Environment.create(
        environment='gym',
        level=environment,
        max_episode_timesteps=episode_limit,
    )
    self._agent = Agent.create(agent=agent_spec, environment=self._environment)
    self._runner = Runner(
        agent=self._agent,
        environment=self._environment,
        save_best_agent=save_best_agent,
    )
def runEnv():
    """
    Train an A2C agent on `CustomEnvironment` for 2000 episodes, then run 1000
    evaluation episodes, print `CustomEnvironment.sum` and close everything.

    An episode's inner loop runs until `CustomEnvironment.extraCounter` equals 100.
    """
    environment = Environment.create(
        environment=CustomEnvironment, max_episode_timesteps=500
    )
    agent = Agent.create(
        agent='a2c', environment=environment, batch_size=10, learning_rate=1e-3
    )

    # Train for 2000 episodes
    for _ in range(2000):
        states = environment.reset()
        # presumably the environment updates extraCounter while executing -- confirm
        while CustomEnvironment.extraCounter != 100:
            actions = agent.act(states=states)
            # execute() returns (states, terminal, reward), as in the evaluation
            # loop below; the previous (states, reward, terminal) unpack fed
            # swapped values to observe().
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 1000 episodes
    sum_rewards = 0.0
    for _ in range(1000):
        states = environment.reset()
        internals = agent.initial_internals()
        while CustomEnvironment.extraCounter != 100:
            actions, internals = agent.act(
                states=states, internals=internals, independent=True
            )
            states, terminal, reward = environment.execute(actions=actions)
            # Accumulated but currently not reported.
            sum_rewards += reward

    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
def main():
    """Run a DQN agent on the Gym 'CartPole-v1' environment for 10000 episodes."""
    # OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Summaries for the listed labels, stored every 100 timesteps.
    summary_spec = dict(
        directory='data/summaries',
        labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
        frequency=100,
    )

    # DQN agent
    agent = Agent.create(
        agent='dqn',
        environment=environment,
        update_frequency=2,
        learning_rate=1e-3,
        summarizer=summary_spec,
        recorder=None,
    )

    # Drive the agent/environment interaction through the runner.
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10000)
    runner.close()
# prepare: builds the (agent, environment) pair used by a unit test.
#
# - Resets `Layer.layers` to None first.
# - If no `environment` is given, a `UnittestEnvironment` is built from
#   `states`, `actions` and `timestep_range`; missing values fall back to
#   (deep copies of) the test class attributes, and the exclude_* flags
#   (argument or class attribute) pop the matching entry from `actions`.
# - If an `environment` is given together with `timestep_range`,
#   `TensorforceError.unexpected()` is raised.
# - The environment is then passed through `Environment.create`.
# - Remaining keyword arguments form the agent specification; keys missing
#   from it are filled in from the class-level `agent` dict.
# - `config` is None when all API functions are required, otherwise it
#   restricts `api_functions` to reset/act (plus observe when required).
# - Returns `(agent, environment)`.
#
# NOTE(review): in this collapsed form it cannot be determined whether the
# `actions.pop(...)` guards are nested under `if actions is None` -- confirm
# against the formatted original before restructuring.
def prepare(self, environment=None, timestep_range=None, states=None, actions=None, exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False, exclude_bounded_action=False, require_observe=False, require_all=False, **agent): """ Generic unit-test preparation. """ Layer.layers = None if environment is None: if states is None: states = deepcopy(self.__class__.states) if actions is None: actions = deepcopy(self.__class__.actions) if exclude_bool_action or self.__class__.exclude_bool_action: actions.pop('bool_action') if exclude_int_action or self.__class__.exclude_int_action: actions.pop('int_action') if exclude_float_action or self.__class__.exclude_float_action: actions.pop('float_action') if exclude_bounded_action or self.__class__.exclude_bounded_action: actions.pop('bounded_action') if timestep_range is None: timestep_range = self.__class__.timestep_range environment = UnittestEnvironment(states=states, actions=actions, timestep_range=timestep_range) elif timestep_range is not None: raise TensorforceError.unexpected() environment = Environment.create(environment=environment) for key, value in self.__class__.agent.items(): if key not in agent: agent[key] = value if self.__class__.require_all or require_all: config = None elif self.__class__.require_observe or require_observe: config = dict(api_functions=['reset', 'act', 'observe']) else: config = dict(api_functions=['reset', 'act']) agent = Agent.create(agent=agent, environment=environment, config=config) return agent, environment
def __init__(self, environment: 'TradingEnvironment', agent_spec: any, **kwargs):
    """
    Wrap the trading environment and build the agent and runner.

    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        kwargs (optional): `max_episode_timesteps` (default `False`) is passed to
            `Environment.create`; `save_best_agent` (default `False`) is passed
            to the runner.
    """
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False)
    self._save_best_agent = kwargs.get('save_best_agent', False)

    # The trading environment is handed to the 'gym' environment type as its level.
    self._environment = Environment.create(
        environment='gym',
        level=environment,
        max_episode_timesteps=self._max_episode_timesteps,
    )

    # Summaries for the listed labels, stored every 100 timesteps.
    summary_spec = dict(
        directory='data/summaries',
        labels=['graph', 'losses', 'rewards'],
        frequency=100,
    )
    self._agent = Agent.create(
        agent=agent_spec, environment=self._environment, summarizer=summary_spec
    )

    self._runner = Runner(
        agent=self._agent,
        environment=self._environment,
        save_best_agent=self._save_best_agent,
    )
def __init__(self, in_dim, n_action, rl, train):
    """
    Set up ports, initial state and, when `rl` is truthy, the RL environment/agent.

    Args:
        in_dim: Dimension of the 'observation' in-port; also passed to the
            environment as `obs_dim`.
        n_action: Number of action choices.
        rl: If truthy, a Tensorforce environment and agent are created.
        train: Mapping read for 'episode_count', 'max_steps' and 'rl_agent'
            (only when `rl` is truthy).
    """
    super().__init__()

    # Ports are created in the same order as before: (factory, name, dimension).
    port_layout = (
        (self.make_in_port, 'observation', in_dim),
        (self.make_in_port, 'reward', 1),
        (self.make_in_port, 'done', 1),
        (self.make_out_port, 'action', 1),
        (self.make_in_port, 'token_in', 1),
        (self.make_out_port, 'token_out', 1),
    )
    for make_port, port_name, port_dim in port_layout:
        make_port(port_name, port_dim)

    self.n_action = n_action  # number of action choices
    # Start with a random action index.
    self.results['action'] = np.array([np.random.randint(n_action)])
    self.model = None
    self.env_type = "MotorEnv"
    self.token = 0
    self.prev_actions = 0
    self.init = True
    self.in_dim = in_dim
    self.rl = rl

    if not rl:
        return

    total_timesteps = train["episode_count"] * train["max_steps"]
    self.env = Environment.create(
        environment=MotorComponent.MotorEnv,
        max_episode_timesteps=total_timesteps,
        n_action=n_action,
        obs_dim=in_dim,
        parent=self,
    )
    self.env.reset()
    self.agent = Agent.create(agent=train['rl_agent'], environment=self.env)
def main():
    """Train an A2C agent on BadSeeds03 for 100000 episodes and save it to 'saved_models'."""
    env_options = dict(seed_count=10, bad_seed_count=3, max_episode_length=100)
    bad_seeds_environment = Environment.create(environment=BadSeeds03, **env_options)

    # Summaries for the listed labels, stored every 100 timesteps.
    summary_spec = dict(
        directory="training_data/agent_01_env_03/summaries",
        labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
        frequency=100,
    )
    agent_options = dict(
        batch_size=100,  # this seems to help a2c
        horizon=20,  # does this help a2c?
        exploration=0.01,  # tried without this at first
        l2_regularization=0.1,
        entropy_regularization=0.2,
        variable_noise=0.05,
    )
    agent = Agent.create(
        agent="a2c",
        environment=bad_seeds_environment,
        summarizer=summary_spec,
        **agent_options,
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=100000)
    agent.save(directory="saved_models")
def main():
    """Run a random agent on BadSeeds02 for 10000 episodes, then close environment and agent."""
    tensorflow_settings()

    env_options = dict(
        seed_count=10, bad_seed_count=3, history_block=2, max_episode_timesteps=100
    )
    bad_seeds_environment = Environment.create(environment=BadSeeds02, **env_options)

    # Summary values are stored every 100 timesteps.
    summary_spec = dict(
        directory="training_data/agent_random_env_02/summaries",
        labels="all",
        frequency=100,
    )
    agent = Agent.create(
        agent="random", environment=bad_seeds_environment, summarizer=summary_spec
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
def set_up():
    """
    Build the BadSeedsSkinny environment and an A2C agent with a small dense network.

    Returns:
        tuple: `(environment, agent)`.
    """
    tensorflow_settings()

    env_options = dict(
        seed_count=10, bad_seed_count=3, history_block=2, max_episode_timesteps=100
    )
    bad_seeds_environment = Environment.create(environment=BadSeedsSkinny, **env_options)

    # Flatten layer followed by two identical dense relu layers.
    dense_layer = dict(type='dense', size=32, activation='relu')
    network_spec = [dict(type='flatten'), dict(dense_layer), dict(dense_layer)]

    # Summary values are stored every 100 timesteps.
    summary_spec = dict(
        directory="training_data/a2c_dense_skinny/summaries",
        labels="all",
        frequency=100,
    )

    # exploration and entropy_regularization are intentionally not set.
    agent = Agent.create(
        agent="a2c",
        network=network_spec,
        batch_size=10000,
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        l2_regularization=0.1,
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=summary_spec,
    )
    return bad_seeds_environment, agent
# main: trains and evaluates an A2C agent ("agent_03") on BadSeeds03.
#
# - Creates the environment with seed_count=10, bad_seed_count=3 and
#   max_episode_length=100.
# - Creates the agent with a summarizer writing to
#   training_data/agent_03_env_03/summaries and a saver writing checkpoints to
#   saved_models/agent_03_env_03/checkpoints.
# - Runs the Runner in a loop of 10 iterations with 10000 training episodes
#   per run, plus a 1000-episode run with evaluation=True.
# - Closes the environment and then the agent at the end.
#
# NOTE(review): in this collapsed form it cannot be determined whether the
# evaluation run belongs inside the `for` loop (once per iteration) or after
# it (once in total) -- confirm against the formatted original.
def main(): bad_seeds_environment = Environment.create( environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100 ) agent = Agent.create( agent="a2c", batch_size=100, horizon=100, # changed from 20 to 100 for agent_03 exploration=0.05, # changed from 0.01 to 0.05 for agent_03 l2_regularization=0.2, # changed from 0.1 to 0.2 for agent_03 #entropy_regularization=0.2, # turned off for agent_03 variable_noise=0.1, # changed from 0.05 to 0.1 for agent_03 environment=bad_seeds_environment, summarizer=dict( directory="training_data/agent_03_env_03/summaries", # list of labels, or 'all' labels=["graph", "entropy", "kl-divergence", "losses", "rewards"], frequency=100, # store values every 100 timesteps ), saver=dict( directory='saved_models/agent_03_env_03/checkpoints', frequency=600 # save checkpoint every 600 seconds (10 minutes) ), ) runner = Runner(agent=agent, environment=bad_seeds_environment) for _ in range(10): runner.run(num_episodes=10000) runner.run(num_episodes=1000, evaluation=True) bad_seeds_environment.close() agent.close()
def test_quickstart(self):
    """Quickstart example: PPO agent on Gym 'CartPole-v1', run for 50 episodes."""
    self.start_tests(name='quickstart')

    # OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # PPO agent settings, grouped by topic.
    optimization = dict(
        batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
        optimization_steps=5,
    )
    reward_estimation = dict(
        likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False
    )
    critic = dict(
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
    )
    exploration_and_regularization = dict(
        exploration=0.0, variable_noise=0.0, l2_regularization=0.0,
        entropy_regularization=0.0,
    )
    tensorflow_etc = dict(
        name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
        saver=None, summarizer=None, recorder=None,
    )

    agent = Agent.create(
        agent='ppo',
        environment=environment,
        network='auto',  # automatically configured network
        preprocessing=None,
        **optimization,
        **reward_estimation,
        **critic,
        **exploration_and_regularization,
        **tensorflow_etc,
    )

    # Initialize and start the runner.
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=50, use_tqdm=False)
    runner.close()

    self.finished_test()
def environment(self, environment: 'TradingEnvironment'):
    """
    Replace the wrapped environment and rebuild the runner around it.

    The existing agent, `_max_episode_timesteps` and `_save_best_agent` are reused.
    """
    self._environment = Environment.create(
        environment='gym',
        level=environment,
        max_episode_timesteps=self._max_episode_timesteps,
    )
    runner_options = dict(
        agent=self._agent,
        environment=self._environment,
        save_best_agent=self._save_best_agent,
    )
    self._runner = Runner(**runner_options)
def test_initialization():
    """Check state shape and the good/bad seed split of a freshly created BadSeeds01."""
    env_options = dict(seed_count=10, bad_seed_count=3, max_episode_length=100)
    bad_seeds_01_env = Environment.create(environment=BadSeeds01, **env_options)

    # One row per episode step, one column per seed.
    assert bad_seeds_01_env.state.shape == (100, 10)

    bad_total = len(bad_seeds_01_env.bad_seeds)
    assert bad_total == 3
    good_total = len(bad_seeds_01_env.good_seeds)
    assert good_total == 7
def _create_env(self) -> Environment:
    """
    Log the call and create the Tensorforce 'gym' environment for the model config.

    The log entry shows `model_config.original_env_name`, while the environment
    itself is created from `model_config.gym_env_name`.
    """
    logged_arguments = f'(environment="gym", level={self.model_config.original_env_name})'
    self.log_api('Environment.create', logged_arguments)
    return Environment.create(environment='gym', level=self.model_config.gym_env_name)
def set_up(
    time_limit=100,
    batch_size=16,
    env_version=1,
    seed_count=10,
    max_count=10,
):
    """
    Create a CartSeed-style environment and an A2C agent for it.

    Parameters
    ----------
    time_limit : int, None
        Passed to the environment as ``measurement_time``
    batch_size : int
        Batch size of the agent
    env_version : int in {1, 2}
        1 selects ``CartSeed``, 2 selects ``CartSeedCountdown``
    seed_count : int
        Passed to the environment as ``seed_count``
    max_count : int
        Passed to the environment as ``max_count``

    Returns
    -------
    Environment
    Agent

    Raises
    ------
    NotImplementedError
        If ``env_version`` is neither 1 nor 2
    """

    def default_score(state, *args):
        # Constant reward, independent of the state.
        return 1

    # Both environment versions take exactly the same arguments.
    shared_options = dict(
        seed_count=seed_count,
        bad_seed_count=None,
        max_count=max_count,
        sequential=True,
        revisiting=True,
        bad_seed_reward_f=default_score,
        measurement_time=time_limit,
    )

    if env_version == 1:
        environment = CartSeed(**shared_options)
    elif env_version == 2:
        environment = CartSeedCountdown(**shared_options)
    else:
        raise NotImplementedError

    env = Environment.create(environment=environment)
    agent = Agent.create(agent="a2c", batch_size=batch_size, environment=env)
    return env, agent
def runEnv():
    """
    Train an A2C agent on `CustomEnvironment` for `CustomEnvironment.trainingEps`
    episodes, then run `CustomEnvironment.testingEps` evaluation episodes, print
    `CustomEnvironment.sum` and close everything.

    An episode's inner loop runs until `CustomEnvironment.extraCounter` equals
    `CustomEnvironment.trials`.
    """
    environment = Environment.create(environment=CustomEnvironment, max_episode_timesteps=500)
    agent = Agent.create(
        agent='a2c',
        environment=environment,
        batch_size=10,
        learning_rate=1e-3,
        exploration=0.01,  # tried without this at first
        variable_noise=0.05,  # variable_noise=0.01 bad?
        l2_regularization=0.1,
        entropy_regularization=0.2,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    # Training episodes
    for episode in range(CustomEnvironment.trainingEps):
        print("Episode: ", episode)
        states = environment.reset()
        # presumably the environment updates extraCounter while executing -- confirm
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions = agent.act(states=states)
            # execute() returns (states, terminal, reward), as in the evaluation
            # loop below; the previous (states, reward, terminal) unpack fed
            # swapped values to observe().
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
        # NOTE(review): reported once per episode -- confirm intended placement.
        print("bad seeds: ", CustomEnvironment.badseedsFinal)

    # Evaluation episodes
    sum_rewards = 0.0
    for episode in range(CustomEnvironment.testingEps):
        print("Episode: ", episode + CustomEnvironment.trainingEps)
        states = environment.reset()
        internals = agent.initial_internals()
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions, internals = agent.act(
                states=states, internals=internals, independent=True
            )
            states, terminal, reward = environment.execute(actions=actions)
            # Accumulated but currently not reported.
            sum_rewards += reward
        # NOTE(review): reported once per episode -- confirm intended placement.
        print("bad seeds: ", CustomEnvironment.badseedsFinal)

    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
def test_environment(self):
    """Create an environment from each supported kind of specification."""
    self.start_tests(name='getting-started-environment')

    # Gym level with explicit limit, Gym level alone, JSON file, import path.
    specifications = (
        dict(environment='gym', level='CartPole', max_episode_timesteps=500),
        dict(environment='gym', level='CartPole-v1'),
        dict(environment='test/data/environment.json', max_episode_timesteps=500),
        dict(
            environment='test.data.custom_env.CustomEnvironment',
            max_episode_timesteps=10,
        ),
    )
    for specification in specifications:
        environment = Environment.create(**specification)
        self.finished_test()
def main():
    """Run a dueling-DQN agent on the Gym 'CartPole-v1' environment for 10000 episodes."""
    # OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Dueling-DQN agent with a custom network specification
    agent = Agent.create(
        agent='dueling_dqn',
        environment=environment,
        network=_create_network_specification((100, )),
    )

    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10000)
    runner.close()
def test_getting_started(self):
    """Getting-started example: manual act/observe loop, then the Runner utility."""
    from tensorforce.agents import Agent
    from tensorforce.environments import Environment

    environment_file = 'test/data/environment.json'
    agent_file = 'test/data/agent.json'

    # Environment and agent are both created from JSON specification files.
    environment = Environment.create(environment=environment_file)
    agent = Agent.create(agent=agent_file, environment=environment)
    agent.initialize()

    # Start of a new episode: reset agent and environment.
    agent.reset()
    states = environment.reset()
    terminal = False

    # Training interaction loop for one episode.
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

    # Evaluation interaction loop.
    # NOTE(review): `terminal` is still truthy from the loop above, so this
    # body is not entered as written.
    while not terminal:
        actions = agent.act(states=states, evaluation=True)
        states, terminal, reward = environment.execute(actions=actions)

    # Close agent and environment.
    agent.close()
    environment.close()

    from tensorforce.execution import Runner

    # The runner builds agent and environment from the same specification files.
    runner = Runner(agent=agent_file, environment=environment_file)
    runner.run(num_episodes=50, use_tqdm=False)
    runner.close()

    self.finished_test()
def main():
    """Train an A2C agent ("agent_04") on Bollux for 1000000 episodes, then close everything."""
    env_options = dict(seed_count=10, bad_seed_count=3, max_episode_length=100)
    bad_seeds_environment = Environment.create(environment=Bollux, **env_options)

    # Experiment notes:
    #   20200820-223031 / 20200820-233243
    #   batch_size 1000 does not get smarter or dumber
    #   batch_size 100 (20200821-095410) gets dumber
    #   this is the batch_size = 10000 version
    summary_spec = dict(
        directory="training_data/agent_04_bollux_1000000/summaries",
        labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
        frequency=100,  # store values every 100 timesteps
    )
    checkpoint_spec = dict(
        directory='saved_models/agent_04_bollux_1000000/checkpoints',
        frequency=6000,  # save checkpoint every 6000 seconds (100 minutes)
    )

    # exploration and entropy_regularization are intentionally not set.
    agent = Agent.create(
        agent="a2c",
        batch_size=10000,
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        l2_regularization=0.1,
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=summary_spec,
        saver=checkpoint_spec,
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000000)

    bad_seeds_environment.close()
    agent.close()
def main():
    """Let the custom DQN agent play 5000 rounds on a BadSeeds02 environment."""
    from tensorforce.environments import Environment
    from bad_seeds.simple.bad_seeds_02 import BadSeeds02
    from bad_seeds.simple.tf_utils import tensorflow_settings

    tensorflow_settings()

    env_options = dict(
        seed_count=10, bad_seed_count=3, history_block=2, max_episode_timesteps=300
    )
    env = Environment.create(environment=BadSeeds02, **env_options)

    # The agent is sized from the state shape and the wrapped environment's seed count.
    state_shape = env.states()['shape']
    seed_total = env.environment.seed_count
    agent = DQN(state_shape, seed_total, exploration=0.25)
    agent.play(env, 5000, verbose=True)
def default_cartseed():
    """
    Create the baseline CartSeed environment: 10 seeds, 3 of them bad, frozen order.

    Returns
    -------
    Environment
    """
    cart_seed = CartSeed(seed_count=10, bad_seed_count=3, frozen_order=True)
    return Environment.create(environment=cart_seed)
def test_initialization():
    """Check array shapes, seed split and initial measurements of a fresh BadSeeds03."""
    env_options = dict(seed_count=10, bad_seed_count=3, max_episode_length=100)
    bad_seeds_03_env = Environment.create(environment=BadSeeds03, **env_options)

    history = bad_seeds_03_env.history_array
    assert history.shape == (100, 10)
    assert bad_seeds_03_env.state.shape == (7, 10)
    assert len(bad_seeds_03_env.bad_seeds) == 3
    assert len(bad_seeds_03_env.good_seeds) == 7

    measurement_count_per_seed, measurement_count = count_measurements(history)
    # Every one of the 10 seeds has been measured, with a per-seed count of 3.
    expected_per_seed = np.full((1, 10), 3.0)
    assert np.all(measurement_count_per_seed == expected_per_seed)
    assert measurement_count == 10
def __init__(self, name='ppo_agent', load_model=None, env=None):
    """
    Initialize the agent's bookkeeping and wrap `env` as a Tensorforce environment.

    Args:
        name: Stored as the agent's name.
        load_model: If truthy, passed to `self.load` after initialization.
        env: Environment passed to `Environment.create` with at most 100
            timesteps per episode.
    """
    # Bookkeeping state.
    self.equity_alive = 0
    self.actions = []
    self.last_action_in_stage = ''
    self.temp_stack = []
    self.name = name
    self.autoplay = True

    # The agent itself and the runner are not created here.
    self.ppo_agent = None
    self.poker_env = Environment.create(environment=env, max_episode_timesteps=100)
    self.runner = None

    if not load_model:
        return
    self.load(load_model)