Example #1
def dqn_train(args):
    seasonals = (args.environment == 'seasonals-v1')
    save_dir = os.path.join(args.folder, args.agent_name)
    train_env = EnvWrap(gym.make('seasonals-v1'), batched=False, 
            subep_len=252, num_subeps=5
            ) if seasonals else OpenAIGym(args.environment)
    test_env = EnvWrap(gym.make('seasonals-v1')
            ) if seasonals else OpenAIGym(args.environment,
                    monitor_video=1, 
                    monitor=os.path.join(save_dir, 'monitoring')) 

    agent = setup_agent(train_env.states, train_env.actions, int(layer_1_size), 
            int(layer_2_size), layer_1_activation, layer_2_activation, 
            has_third_layer == 'True',
            float(learning_rate), float(baseline_learning_rate),
            save_dir=save_dir)
    
    rewards, test_rewards, test_episodes = train(
            agent, train_env, num_episodes=args.num_episodes)
    agent.close()
    train_env.close()
    plot_rewards(rewards, test_rewards=test_rewards, 
            test_episodes=test_episodes)
    loss, history = test(agent, test_env)
    graph_episode(history)
Example #2
def _generate_episode_data(episode_id, gym_id, monitor):
    if episode_id % 100 == 0:
        print('Computing game', episode_id)

    try:
        environment = OpenAIGym(
            gym_id=gym_id,
            monitor=monitor if episode_id == 0 else None,
            monitor_video=1 if episode_id == 0 else 0
        )
        state = environment.reset()
        world = environment.gym.unwrapped.world
        interface = VincentSalimInterface()
        interface.start(world)
        episode = []
        while True:
            interface.feed(world)
            action = interface.get_moves(world, 0)
            episode.append((state, action))
            state, terminal, step_reward = environment.execute(action)
            if terminal:
                break
        return episode
    except Exception as e:
        print('An exception occurred during game generation!', e)
        return []
Example #3
def main():
    env = OpenAIGym("P3DX-v0")

    agent = DQNAgent(states=dict(type='float', shape=(80, 80, 4)),
                     actions=dict(type='int', num_actions=7),
                     network=[
                         dict(type="conv2d",
                              size=16,
                              window=[8, 8],
                              stride=4,
                              activation="relu"),
                         dict(type="conv2d",
                              size=32,
                              window=[4, 4],
                              stride=2,
                              activation="relu"),
                         dict(type="flatten"),
                         dict(type="dense", size=256)
                     ],
                     actions_exploration=dict(type="epsilon_decay",
                                              initial_epsilon=1.0,
                                              final_epsilon=0.1,
                                              timesteps=1000),
                     memory=dict(type="replay",
                                 capacity=1000,
                                 include_next_states=True),
                     update_mode=dict(unit="timesteps",
                                      batch_size=16,
                                      frequency=4),
                     discount=0.99,
                     entropy_regularization=None,
                     double_q_model=True,
                     optimizer=dict(type="adam", learning_rate=1e-4))

    try:
        agent.restore_model(directory="modelo/", file="data-129235")
        print("Found data!")
    except Exception as e:
        print(e)
        print("Can't load data")

    print("Starting execution")
    state = env.reset()
    agent.reset()
    try:
        while True:
            # Get action - no exploration and no observing
            action = agent.act(state, deterministic=True, independent=True)
            print(action)

            # Execute action in the environment
            state, terminal_state, reward = env.execute(action)

            if terminal_state:
                raise KeyboardInterrupt
    except KeyboardInterrupt:
        print("Terminal state", terminal_state)
        state = env.reset()
        agent.reset()
Example #4
def main():
    #tensorforce
    env = OpenAIGym('JacoArm-v0')

    agent = TRPOAgent(states_spec=env.states,
                      actions_spec=env.actions,
                      network_spec=network_spec,
                      batch_size=512)

    # agent = PPOAgent(
    # 	states_spec=env.states,
    # 	actions_spec=env.actions,
    # 	network_spec=network_spec,
    # 	batch_size=512,
    # 	step_optimizer=dict(
    # 		type='adam',
    # 		learning_rate=1e-4
    # 	)
    # )

    runner = Runner(agent=agent, environment=env)

    raw_input("hit enter when gazebo is loaded...")
    print()
    env.gym.unpause()
    env.gym.hold_init_robot_pos([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0])
    runner.run(episodes=1500,
               max_episode_timesteps=1000,
               episode_finished=episode_finished)

    #old-fashioned way
    # env = gym.make('JacoArm-v0')
    # print "launching the world..."
    # #gz loading issues, let user start the learning
    # raw_input("hit enter when gazebo is loaded...")
    # env.set_physics_update(0.0001, 10000)
    # raw_input("hit enter when gazebo is loaded...")

    # # env.set_goal([0.167840578046, 0.297489331432, 0.857454500127])

    # total_episodes = 100
    # action = [1,1,1,1,1,1,1,1,1,1]
    # x = 0
    # # for x in range(total_episodes):
    # while True:
    # 	# if x % 10 is 0:
    # 	action = numpy.random.rand(1, 10)[0]
    # 		# print 'new action is', action

    # 	state, reward, done, _ = env.step(action)
    # 	print reward
    # 	time.sleep(0.2)
    # 	x += 1

    write_to_csv(train_data, 'test.csv')
    env.close()
Example #5
    def __init__(self, game, state=None):
        self.game = game
        if state is None:
            self.gym = retro.make(game)
        else:
            self.gym = retro.make(game, state=state)
        self.visualize = False

        self.states_spec = OpenAIGym.specs_from_gym_space(
            space=self.gym.observation_space, ignore_value_bounds=True)
        self.actions_spec = OpenAIGym.specs_from_gym_space(
            space=self.gym.action_space, ignore_value_bounds=False)
Example #6
def main():
    gym_id = 'CartPole-v0'
    max_episodes = 10000
    max_timesteps = 1000

    env = OpenAIGym(gym_id)
    network_spec = [
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ]

    agent = DQNAgent(
        states_spec=env.states,
        actions_spec=env.actions,
        network_spec=network_spec,
        batch_size=64
    )

    runner = Runner(agent, env)
    
    report_episodes = 10

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logging.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logging.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logging.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))

    runner.run(max_episodes, max_timesteps, episode_finished=episode_finished)
    
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
Example #7
    def states(self) -> Tuple[TensorForceStateType, TensorForceStateShape]:
        """The state space specification, required for `tensorforce` agents.

        The tuple contains the following attributes:
            - type: Either 'bool', 'int', or 'float'.
            - shape: The shape of the space. An `int` or `list`/`tuple` of `int`s.
        """
        from tensorforce.contrib.openai_gym import OpenAIGym
        return OpenAIGym.state_from_space(self.observation_space)
    def actions(self) -> Tuple[TensorForceStateType, TensorForceStateShape, int, TensorForceMinMaxValue, TensorForceMinMaxValue]:
        """The action space specification, required for `tensorforce` agents.

        The tuple contains the following attributes:
            - type: Either 'bool', 'int', or 'float'.
            - shape: The shape of the space. An `int` or `list`/`tuple` of `int`s.
            - num_actions (required if type == 'int'): The number of discrete actions.
            - min_value (optional if type == 'float'): An `int` or `float`. Defaults to `None`.
            - max_value (optional if type == 'float'): An `int` or `float`. Defaults to `None`.
        """
        from tensorforce.contrib.openai_gym import OpenAIGym
        return OpenAIGym.action_from_space(self.action_space)
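    # For illustration only (the values below are assumptions, not taken from
    # this file): with the older tensorforce releases these helpers target, a
    # CartPole-like environment typically yields specs along the lines of
    #     states  -> dict(type='float', shape=(4,))
    #     actions -> dict(type='int', num_actions=2)
    # The exact keys depend on the tensorforce version in use.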
    def test_example(self):
        sys.stdout.write('\nQuickstart:\n')
        sys.stdout.flush()

        passed = 0
        for _ in xrange(3):

            # Create an OpenAI-Gym environment
            environment = OpenAIGym('CartPole-v0')

            # Network specification for the model
            network_spec = [
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ]

            # Create the agent
            agent = PPOAgent(states_spec=environment.states,
                             actions_spec=environment.actions,
                             network_spec=network_spec,
                             batch_size=4000,
                             step_optimizer=dict(type='adam',
                                                 learning_rate=1e-2),
                             optimization_steps=5,
                             discount=0.99,
                             normalize_rewards=False,
                             entropy_regularization=0.01,
                             likelihood_ratio_clipping=0.2)

            # Initialize the runner
            runner = Runner(agent=agent, environment=environment)

            # Function handle called after each finished episode
            def episode_finished(r):
                # Keep running until the mean reward over the last 50 episodes shows that learning has taken off
                mean_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or mean_reward < 50.0

            # Start the runner
            runner.run(episodes=2000,
                       max_episode_timesteps=200,
                       episode_finished=episode_finished)

            sys.stdout.write('episodes: {}\n'.format(runner.episode))
            sys.stdout.flush()

            # Test passed if episode_finished handle evaluated to False
            if runner.episode < 2000:
                passed += 1

        sys.stdout.write('==> passed: {}\n'.format(passed))
        sys.stdout.flush()
        self.assertTrue(passed >= 2)
Example #10
    def execute(self, actions):
        flat, hydrated, network = self.get_hypers(actions)

        env = OpenAIGym('CartPole-v0', visualize=True)
        env.viewer = None
        agent = agents_dict[self.agent](states_spec=env.states,
                                        actions_spec=env.actions,
                                        network_spec=network,
                                        **hydrated)

        # n_train, n_test = 2, 1
        n_train, n_test = 250, 30
        runner = Runner(agent=agent, environment=env)
        runner.run(episodes=n_train)  # train
        runner.run(episodes=n_test, deterministic=True)  # test
        # You may need to remove runner.py's close() calls so you have access to runner.episode_rewards, see
        # https://github.com/lefnire/tensorforce/commit/976405729abd7510d375d6aa49659f91e2d30a07

        # I personally save away the results so I can play with them manually w/ scikitlearn & SQL
        rewards = runner.episode_rewards
        reward = np.mean(rewards[-n_test:])
        print(flat, f"\nReward={reward}\n\n")

        sql = """
          INSERT INTO runs (hypers, reward_avg, rewards, agent, flag)
          VALUES (:hypers, :reward_avg, :rewards, :agent, :flag)
        """
        try:
            self.conn.execute(text(sql),
                              hypers=json.dumps(flat),
                              reward_avg=reward,
                              rewards=rewards,
                              agent='ppo_agent',
                              flag=self.net_type)
        except Exception as e:
            pdb.set_trace()

        runner.close()
        return reward
Example #11
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAIgym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Proximal Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=256,

                memory=dict(
                    type='prioritized_replay',
                ),
                update_frequency=256,
                first_update=512,

                learning_rate=0.0001,
                optimizer_batch_size=64,
                normalize_rewards=False,
                gae_rewards=False,
                baseline=dict(
                    type="mlp",
                    sizes=[32, 32],
                    epochs=1,
                    update_batch_size=64,
                    learning_rate=0.001
                ),
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Keep running until the mean reward over the last 50 episodes shows that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #12
File: eval.py  Project: Wellan89/rl-board
def _generate_episode_data(episode_id, gym_id, model_data, versus_model_data,
                           monitor):
    try:
        environment = OpenAIGym(gym_id=gym_id,
                                monitor=monitor if episode_id == 0 else None,
                                monitor_video=1 if episode_id == 0 else 0)
        unwrapped_gym = environment.gym.unwrapped
        predictor = OpponentPredictor(env=unwrapped_gym, **model_data)
        if versus_model_data:
            unwrapped_gym.set_opponent_factory(lambda: OpponentPredictor(
                env=unwrapped_gym, **versus_model_data))
        state = environment.reset()
        reward = 0.0
        while True:
            action = predictor(state)
            state, terminal, step_reward = environment.execute(action)
            reward += step_reward
            if terminal:
                break
        return reward
    except Exception as e:
        print('An exception occurred during game generation!', e)
        return 0.0
Example #13
    def __init__(self,
                 rng: Union[int, np.random.RandomState, None] = None,
                 defaults: Union[Dict, None] = None,
                 max_episodes: Union[int, None] = 3000):
        """
        Base class for the "cartpole" benchmark, in which a PPO agent tries to solve the cartpole task.

        Parameters
        ----------
        rng : int,None,np.RandomState
            RandomState for the experiment
        defaults : dict, None
            default configuration used for the PPO agent
        max_episodes : int, None
            Upper limit on the number of episodes for the cartpole runner. Defaults to 3000
        """

        logger.warning('This Benchmark is not deterministic.')
        super(CartpoleBase, self).__init__()

        self.rng = rng_helper.get_rng(rng=rng)
        tf.random.set_random_seed(0)
        np.random.seed(0)
        self.env = OpenAIGym('CartPole-v0', visualize=False)
        self.avg_n_episodes = 20
        self.max_episodes = max_episodes

        self.defaults = {
            "n_units_1": 64,
            "n_units_2": 64,
            "batch_size": 64,
            "learning_rate": 1e-3,
            "discount": 0.99,
            "likelihood_ratio_clipping": 0.2,
            "activation_1": "tanh",
            "activation_2": "tanh",
            "optimizer_type": "adam",
            "optimization_steps": 10,
            "baseline_mode": "states",
            "baseline_n_units_1": 64,
            "baseline_n_units_2": 64,
            "baseline_learning_rate": 1e-3,
            "baseline_optimization_steps": 10,
            "baseline_optimizer_type": "adam"
        }

        if defaults is not None:
            self.defaults.update(defaults)
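    # Usage sketch (the subclass name is hypothetical, not part of this file):
    # a concrete cartpole benchmark built on this base class would be
    # constructed roughly like
    #     benchmark = CartpoleBenchmark(rng=1,
    #                                   defaults={"learning_rate": 3e-4},
    #                                   max_episodes=2000)
    # so that the passed dict overrides the matching keys in self.defaults.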
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAIgym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = TRPOAgent(config=Configuration(
                log_level='info',
                batch_size=100,
                baseline=dict(
                    type='mlp',
                    size=32,
                    hidden_layers=1,
                    epochs=20,
                    update_batch_size=32
                ),
                generalized_advantage_estimation=True,
                normalize_advantage=False,
                gae_lambda=0.97,
                max_kl_divergence=0.005,
                cg_iterations=20,
                cg_damping=0.01,
                ls_max_backtracks=20,
                ls_override=False,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Keep running until the mean reward over the last 50 episodes shows that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #15
def experiment(args, env_name, base_agent="agent.json", 
        agent_folder=None, visualize=True, num_episodes=1000):
    
    seasonals = (env_name=="seasonals-v1")

    train_env = OpenAIGym(env_name) \
            if not seasonals else EnvWrap(
                    gym.make('seasonals-v1'), batched=True,
                    subep_len=252, num_subeps=5)
    test_env = OpenAIGym(env_name, monitor_video=1, 
            monitor=os.path.join(agent_folder, "monitor")) \
                if not seasonals else EnvWrap(gym.make('seasonals-v1'))

    agent = setup_agent(train_env.states, train_env.actions, args, 
            save_dir=agent_folder, base_agent_file=base_agent)

    rewards, test_episodes, test_rewards = train(
            agent, train_env, num_episodes=num_episodes, 
            test_env=train_env)

    train_env.close()
    if visualize:
        plot_rewards(rewards, 
                test_episodes=test_episodes,
                test_rewards=test_rewards,
                save_dir=agent_folder)
    reward, history = test(agent, test_env, start_index=(
        test_env.first_trading_day + 252 * 5 if seasonals else None))

    graph_episode(history, 
            save_path=os.path.join(agent_folder, "test.png"))
    test_env.close()
    agent.close()
    experiment_data = {"final_test_reward":reward,
            "test_average_last_50":np.mean(test_rewards[-10:]),
            "train_average_last_50":np.mean(rewards[-50:]),
            "test_average_last_10":np.mean(test_rewards[-2:]),
            "train_average_last_10":np.mean(rewards[-10:]),
            }
    experiment_data.update(args)
    return experiment_data
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAIgym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Proximal Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=4096,
                gae_lambda=0.97,
                learning_rate=0.001,
                entropy_penalty=0.01,
                epochs=5,
                optimizer_batch_size=512,
                loss_clipping=0.2,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Keep running until the mean reward over the last 50 episodes shows that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000,
                       max_timesteps=200,
                       episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #17
import numpy as np
import time

from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym
import tensorflow as tf

cluster = {'ps': ['127.0.0.1:12222'], 'worker': ['127.0.0.1:12223']}
cluster_spec = tf.train.ClusterSpec(cluster)

# Create an OpenAIgym environment
# ReversedAddition-v0
# CartPole-v0
env = OpenAIGym('CartPole-v0', visualize=True)

# Network as list of layers
network_spec = [
    dict(type='dense', size=32, activation='relu'),
    dict(type='dense', size=32, activation='relu')
]

distributed_spec = dict(cluster_spec=cluster_spec,
                        task_index=0,
                        device=('/job:worker'))

agent = PPOAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
Example #18

import numpy as np

from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAIgym environment.
environment = OpenAIGym('CartPole-v0', visualize=False)

# Network as list of layers
# - Embedding layer:
#   - For Gym environments utilizing a discrete observation space, an
#     "embedding" layer should be inserted at the head of the network spec.
#     Such environments are usually identified by either:
#     - class ...Env(discrete.DiscreteEnv):
#     - self.observation_space = spaces.Discrete(...)

# Note that depending on the following layers used, the embedding layer *may* need a
# flattening layer

network_spec = [
    # dict(type='embedding', indices=100, size=32),
    # dict(type='flatten'),
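
# A minimal sketch (layer sizes are assumptions, not from the original file) of
# a complete spec following the note above: an embedding layer for a Discrete
# observation space, a flattening layer if needed, then dense layers.
#
#     network_spec = [
#         dict(type='embedding', indices=100, size=32),
#         dict(type='flatten'),
#         dict(type='dense', size=32),
#         dict(type='dense', size=32)
#     ]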
Example #19
    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('drone_training')
    outdir = pkg_path + '/training_results'

    rospy.loginfo("Monitor Wrapper started")

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(gym_id='QuadcopterLiveShow-v0',
                            monitor='output',
                            monitor_safe=False,
                            monitor_video=False,
                            visualize=True)
    print(os.getcwd())
    with open(
            '/root/catkin_ws/src/drone_training/drone_training/configs/dqn_ue4.json',
            'r') as fp:
        agent = json.load(fp=fp)

    with open(
            '/root/catkin_ws/src/drone_training/drone_training/configs/mynet.json',
            'r') as fp:
        network = json.load(fp=fp)

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(
Example #20

import numpy as np

from tensorforce.agents import RandomAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

import sc2gym
from absl import flags
FLAGS = flags.FLAGS
FLAGS([__file__])

# Create an OpenAIgym environment
env = OpenAIGym('SC2CollectMineralShards-v2', visualize=False)

agent = RandomAgent(
    states_spec=env.states,
    actions_spec=env.actions,
)
# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
rewards = []


def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
Example #21
import numpy as np
import json

from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAIgym environment
env = OpenAIGym('Pendulum-v0', visualize=False)

network_path = './pendulum_ppo_network.json'
agent_path = './pendulum_ppo.json'
with open(network_path, 'r') as fp:
    network_spec = json.load(fp=fp)
with open(agent_path, 'r') as fp:
    agent_config = json.load(fp=fp)
agent = Agent.from_spec(spec=agent_config,
                        kwargs=dict(states=env.states,
                                    actions=env.actions,
                                    network=network_spec))

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
    return True
Example #22
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-config',
                        help="Network configuration file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=50000,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--max-timesteps',
                        type=int,
                        default=2000,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-w',
                        '--num-workers',
                        type=int,
                        default=1,
                        help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M',
                        '--mode',
                        choices=['tmux', 'child'],
                        default='tmux',
                        help="Starter mode")
    parser.add_argument('-L',
                        '--logdir',
                        default='logs_async',
                        help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true')
    parser.add_argument('-i',
                        '--task-index',
                        type=int,
                        default=0,
                        help="Task index")
    parser.add_argument('-K',
                        '--kill',
                        action='store_true',
                        default=False,
                        help="Kill runners")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(
            12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(
                    session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir)

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=', sys.executable, target_script,
                args.gym_id, '--is-child', '--agent', args.agent,
                '--agent-config',
                os.path.join(os.getcwd(),
                             args.agent_config), '--network-config',
                os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers, '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + [
                'tmux new-session -d -s {} -n ps'.format(session_name)
            ]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(
                    session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(
        dict(states=environment.states,
             actions=environment.actions,
             network=from_json(args.network_config)))

    agent_config.default(
        dict(distributed=True,
             cluster_spec=cluster_spec,
             global_model=(args.task_index == -1),
             device=('/job:ps' if args.task_index == -1 else
                     '/job:worker/task:{}/cpu:0'.format(args.task_index))))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.log_level])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(
        gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(agent=agent,
                    environment=environment,
                    repeat_actions=1,
                    cluster_spec=cluster_spec,
                    task_index=args.task_index)

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.total_timesteps / (time.time() - r.start_time)
            logger.info(
                "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
                .format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes,
               args.max_timesteps,
               episode_finished=episode_finished)
Example #23
import numpy as np

from tensorforce.agents import PPOAgent, RandomAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAIgym environment
# ReversedAddition-v0
# CartPole-v0
env = OpenAIGym('ReversedAddition-v0', visualize=False)

print(env.gym.observation_space)
print(env.gym.action_space)

# Network as list of layers
network_spec = [
    dict(type='embedding', size=32, indices=100),
    dict(type='dense', size=32),
    dict(type='dense', size=32)
]

agent = PPOAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
    batch_size=4096,
    # Agent
    preprocessing=None,
    exploration=None,
    reward_preprocessing=None,
    # BatchAgent

# set the network layout
network_spec = [
    dict(type='dense', size=64),
    dict(type='dense', size=32),
    dict(type='dense', size=32)
]

for memory_type in memory_types:
    #create filename
    fn = 'Acrobot_10k_' + str(model_type) + '_' + str(memory_type) + '.pkl'
    print(fn)
    d1 = datetime.datetime.now()
    # set up the Acrobot environment
    environment = OpenAIGym('Acrobot-v1', visualize=False)
    #define the memory and model types
    memory = define_memory(memory_type)
    double_model = define_model(model_type)
    # create the agent
    agent = create_agent(memory, double_model, environment)
    # create the runner
    runner = Runner(agent=agent, environment=environment)
    # teach the agent
    runner.run(episodes=10000, episode_finished=episode_finished)
    runner.close()
    # Print statistics
    print(
        "Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}."
        .format(ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))
    # print time taken
Example #25
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        help="Choose actions deterministically")
    parser.add_argument('-M',
                        '--mode',
                        choices=('tmux', 'child'),
                        default='tmux',
                        help="Starter mode")
    parser.add_argument('-W',
                        '--num-workers',
                        type=int,
                        default=1,
                        help="Number of worker agents")
    parser.add_argument('-C',
                        '--child',
                        action='store_true',
                        help="Child process")
    parser.add_argument('-P',
                        '--parameter-server',
                        action='store_true',
                        help="Parameter server")
    parser.add_argument('-I',
                        '--task-index',
                        type=int,
                        default=0,
                        help="Task index")
    parser.add_argument('-K',
                        '--kill',
                        action='store_true',
                        help="Kill runners")
    parser.add_argument('-L',
                        '--logdir',
                        default='logs_async',
                        help="Log directory")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'OpenAI-' + args.gym_id
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(
            12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(
                    session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir)

        def build_cmd(ps, index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=', sys.executable, target_script,
                args.gym_id, '--agent',
                os.path.join(os.getcwd(), args.agent), '--network',
                os.path.join(os.getcwd(), args.network), '--num-workers',
                args.num_workers, '--child', '--task-index', index
            ]
            if args.episodes is not None:
                cmd_args.append('--episodes')
                cmd_args.append(args.episodes)
            if args.timesteps is not None:
                cmd_args.append('--timesteps')
                cmd_args.append(args.timesteps)
            if args.max_episode_timesteps is not None:
                cmd_args.append('--max-episode-timesteps')
                cmd_args.append(args.max_episode_timesteps)
            if args.deterministic:
                cmd_args.append('--deterministic')
            if ps:
                cmd_args.append('--parameter-server')
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + [
                'tmux new-session -d -s {} -n ps'.format(session_name)
            ]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]

        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0)))

        for i in xrange(args.num_workers):
            name = 'worker{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(
                    session_name, name, shell))
            cmds.append(
                wrap_cmd(session_name, name, build_cmd(ps=False, index=i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # log_levels[agent.log_level])

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    if args.parameter_server:
        agent['device'] = '/job:ps/task:{}'.format(args.task_index)  # '/cpu:0'
    else:
        agent['device'] = '/job:worker/task:{}'.format(
            args.task_index)  # '/cpu:0'

    agent['execution'] = dict(
        type='distributed',
        distributed_spec=dict(cluster_spec=cluster_spec,
                              task_index=args.task_index,
                              job='ps' if args.parameter_server else 'worker',
                              protocol='grpc'))

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(states=environment.states,
                                        actions=environment.actions,
                                        network=network))

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(
        gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {} after overall {} timesteps. Steps Per Second {}"
                .format(r.agent.episode, r.agent.timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    runner.run(timesteps=args.timesteps,
               episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()
Example #26
import numpy as np
import time
import matplotlib.pyplot as plt

from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

env = OpenAIGym('MountainCar-v0', visualize=False)

network_spec = [
    dict(type='dense', size=16, activation='relu'),
    dict(type='dense', size=16, activation='relu'),
    dict(type='dense', size=16, activation='relu')
]

agent = PPOAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
    batch_size=1024,
    # Agent
    # preprocessing=None,
    # exploration=None,
    # reward_preprocessing=None,
    # BatchAgent
    keep_last_timestep=True,
    # PPOAgent
    step_optimizer=dict(
        type='adam',
        learning_rate=1e-3
Example #27
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-i',
                        '--import-modules',
                        help="Import module(s) required for environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize',
                        action='store_true',
                        default=False,
                        help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")
    parser.add_argument('-te',
                        '--test',
                        action='store_true',
                        default=False,
                        help="Test agent without learning.")
    parser.add_argument(
        '-sl',
        '--sleep',
        type=float,
        default=None,
        help=
        "Slow down simulation by sleeping for x seconds (fractions allowed).")
    parser.add_argument(
        '--job',
        type=str,
        default=None,
        help="For distributed mode: The job type of this agent.")
    parser.add_argument(
        '--task',
        type=int,
        default=0,
        help="For distributed mode: The task index of this agent.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(gym_id=args.gym_id,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video,
                            visualize=args.visualize)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(
                                states=environment.states,
                                actions=environment.actions,
                                network=network,
                            ))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {} ()".format(save_dir))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)

        return True

    runner.run(num_timesteps=args.timesteps,
               num_episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished,
               testing=args.test,
               sleep=args.sleep)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
Example #28
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(gym_id=args.gym_id,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video)

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()
        logger.info("No agent configuration provided.")

    if args.network_spec:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=args.agent,
                            kwargs=dict(states_spec=environment.states,
                                        actions_spec=environment.actions,
                                        network_spec=network_spec,
                                        config=config))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: report per timestep?
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {} after {} timesteps. Steps Per Second {}".
                format(r.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(timesteps=args.timesteps,
               episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)

    logger.info(
        "Learning finished. Total episodes: {ep}".format(ep=runner.episode))
Example #29
        print("NOT ENOUGH LOGGING INFO")
        print("Please write more about the changes and reasoning.")
        exit()

    with open(f"{TB_path}/README/README.txt", "w") as readme:
        start_time_ascii = time.asctime(time.localtime(time.time()))
        algorithm = os.path.basename(__file__)[:-3]  # strip the ".py" extension
        print(f"Experiment start time: {start_time_ascii}", file=readme)
        print(f"\nAlgorithm:\n{algorithm}", file=readme)
        print(f"\nThe Changes:\n{changes}", file=readme)
        print(f"\nReasoning:\n{reasoning}", file=readme)
        print(f"\nHypothesis:\n{hypothesis}", file=readme)
        print(f"\nResults:\n", file=readme)

# Create an OpenAIgym environment.
environment = OpenAIGym('BizHawk-v0', visualize=False)
environment.gym.logging_folder_path = TB_path

# Network as list of layers
# - Embedding layer:
#   - For Gym environments utilizing a discrete observation space, an
#     "embedding" layer should be inserted at the head of the network spec.
#     Such environments are usually identified by either:
#     - class ...Env(discrete.DiscreteEnv):
#     - self.observation_space = spaces.Discrete(...)

# Note that depending on the following layers used, the embedding layer *may* need a
# flattening layer

# BREADCRUMBS_START
network_spec = [
Example #30
import numpy as np
import json

from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAIgym environment
env = OpenAIGym('MountainCar-v0', visualize=False)

network_path = './mountain_car_ppo_network.json'
agent_path = './mountain_car_ppo.json'
with open(network_path, 'r') as fp:
    network_spec = json.load(fp=fp)
with open(agent_path, 'r') as fp:
    agent_config = json.load(fp=fp)
agent = Agent.from_spec(
    spec=agent_config,
    kwargs=dict(
        states=env.states,
        actions=env.actions,
        network=network_spec
    )
)

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):