Example #1
    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        session_config=None,
        scope='dqfd',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,
        distributions_spec=None,
        entropy_regularization=None,
        target_sync_frequency=10000,
        target_update_weight=1.0,
        huber_loss=None,
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        batched_observe=1000,
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1,
        expert_margin=0.5,
        supervised_weight=0.1,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            session_config: Optional tf.ConfigProto with additional desired session configurations.
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`,
                and `evolutionary`. Consult the optimizer test or example configurations for more.
            discount: Float specifying reward discount factor.
            normalize_rewards: Boolean flag specifying whether to normalize rewards, default False.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each
                preprocessor is a dict containing a type and optional necessary arguments.
            exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise)
                and arguments.
            reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
            memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
            first_update: Int describing at which time step the first update is performed. Should be larger
                than batch size.
            update_frequency: Int specifying number of observe steps to perform until an update is executed.
            repeat_update: Int specifying how many update steps are performed per update, where each update step implies
                sampling a batch from the memory and passing it to the model.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(
                type='replay',
                capacity=100000
            )
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double dqn, which is a required key for a q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
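        # E.g. with the defaults batch_size=32 and demo_sampling_ratio=0.2 this gives
        # demo_batch_size = int(0.2 * 32 / 0.8) = 8, i.e. 8 expert transitions are
        # sampled per update in addition to the 32 online transitions.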

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )
        # This is the demonstration memory that we will fill with observations before
        # starting the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)
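
A minimal usage sketch follows, assuming the older TensorForce-style API in which DQFDAgent exposes import_demonstrations() and pretrain() for the demonstration phase; the specs, method names and the demonstration format are illustrative and should be checked against the installed version.

from tensorforce.agents import DQFDAgent

# Hypothetical single-state, discrete-action specs.
states_spec = dict(shape=(4,), type='float')
actions_spec = dict(type='int', num_actions=2)
network_spec = [dict(type='dense', size=32), dict(type='dense', size=32)]

agent = DQFDAgent(
    states_spec=states_spec,
    actions_spec=actions_spec,
    network_spec=network_spec,
    batch_size=32,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2
)

# Demonstrations are assumed to be dicts with states, internals, actions,
# terminal and reward entries collected from an expert policy.
# agent.import_demonstrations(demonstrations)
# agent.pretrain(steps=10000)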
Example #2
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    args = parser.parse_args()

    # From the quickstart in the docs: network as a list of layers
    # (this is from mlp2_embedding_network.json).
    network_spec = [
        {
            "type": "dense",
            "size": 32
            # "activation": "relu"
        },
        {
            "type": "dense",
            "size": 32
            # "activation": "relu"
        }
    ]

    DATAPATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    observedFile = os.path.join(DATAPATH,r"prnio.int")
    infoFile = os.path.join(DATAPATH,r"prnio.cfl")

    environment = PycrysfmlEnvironment(observedFile, infoFile)

    #get agent configuration
    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    agent = Agent.from_spec(
            spec=agent_config,
            kwargs=dict(
                states=environment.states,
                actions=environment.actions,
                network=network_spec,
            )
        )

    # Use this line to restore a pre-trained agent
    #agent.restore_model(file="/mnt/storage/deepQmodel_chisq")

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    rewardsLog = []
    steps = []

    def episode_finished(r):
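        # Called by the Runner after every episode; returning False here would stop training early.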

        if r.episode % 10 == 0:
            rewardsLog.append(r.episode_rewards[-1])
            steps.append(r.episode)

        if r.episode % 50 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            with open("/mnt/storage/trainingLog", "a") as logfile:
                logfile.write("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}\n".format(
                    ep=r.episode, ts=r.timestep, sps=sps))
                logfile.write("Episode reward: {}\n".format(r.episode_rewards[-1]))
                logfile.write("Episode timesteps: {}\n".format(r.episode_timestep))
                logfile.write("Average of last 500 rewards: {}\n".format(
                    sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
                logfile.write("Average of last 100 rewards: {}\n".format(
                    sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))

            agent.save_model(directory="/mnt/storage/deepQmodel_simpleA_stdreward", append_timestep=False)

        return True

    runner.run(
        timesteps=60000000,
        episodes=5000,
        max_episode_timesteps=1000,
        deterministic=False,
        episode_finished=episode_finished
    )

    #graph rewards
    plt.scatter(steps, rewardsLog)
    plt.savefig('/mnt/storage/rewardLog_simpleA_stdreward.png')

    runner.close()
Example #3
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        help="Choose actions deterministically")
    parser.add_argument('-M',
                        '--mode',
                        choices=('tmux', 'child'),
                        default='tmux',
                        help="Starter mode")
    parser.add_argument('-W',
                        '--num-workers',
                        type=int,
                        default=1,
                        help="Number of worker agents")
    parser.add_argument('-C',
                        '--child',
                        action='store_true',
                        help="Child process")
    parser.add_argument('-P',
                        '--parameter-server',
                        action='store_true',
                        help="Parameter server")
    parser.add_argument('-I',
                        '--task-index',
                        type=int,
                        default=0,
                        help="Task index")
    parser.add_argument('-K',
                        '--kill',
                        action='store_true',
                        help="Kill runners")
    parser.add_argument('-L',
                        '--logdir',
                        default='logs_async',
                        help="Log directory")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'OpenAI-' + args.gym_id
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(
            12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
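    # --kill tears down a previous run: free the parameter-server/worker port range
    # (12222..12222+num_workers) and kill the tmux session.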
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(
                    session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir)

        def build_cmd(ps, index):
            cmd_args = [
                # 'CUDA_VISIBLE_DEVICES=',
                sys.executable,
                target_script,
                args.gym_id,
                '--agent',
                os.path.join(os.getcwd(), args.agent),
                '--network',
                os.path.join(os.getcwd(), args.network),
                '--num-workers',
                args.num_workers,
                '--child',
                '--task-index',
                index
            ]
            if args.episodes is not None:
                cmd_args.append('--episodes')
                cmd_args.append(args.episodes)
            if args.timesteps is not None:
                cmd_args.append('--timesteps')
                cmd_args.append(args.timesteps)
            if args.max_episode_timesteps is not None:
                cmd_args.append('--max-episode-timesteps')
                cmd_args.append(args.max_episode_timesteps)
            if args.deterministic:
                cmd_args.append('--deterministic')
            if ps:
                cmd_args.append('--parameter-server')
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + [
                'tmux new-session -d -s {} -n ps'.format(session_name)
            ]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#!/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]

        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0)))

        for i in xrange(args.num_workers):
            name = 'worker{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(
                    session_name, name, shell))
            cmds.append(
                wrap_cmd(session_name, name, build_cmd(ps=False, index=i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)
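    # With --num-workers 2, for example, this builds
    # cluster = {'ps': ['127.0.0.1:12222'], 'worker': ['127.0.0.1:12223', '127.0.0.1:12224']}.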

    environment = OpenAIGym(args.gym_id)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)  # log_levels[agent.log_level])

    stdout_logger = logging.StreamHandler(sys.stdout)
    stdout_logger.setLevel(logging.INFO)
    logger.addHandler(stdout_logger)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    if args.parameter_server:
        agent['device'] = '/job:ps/task:{}'.format(args.task_index)  # '/cpu:0'
    else:
        agent['device'] = '/job:worker/task:{}'.format(
            args.task_index)  # '/cpu:0'

    agent['execution'] = dict(
        type='distributed',
        distributed_spec=dict(cluster_spec=cluster_spec,
                              task_index=args.task_index,
                              job='ps' if args.parameter_server else 'worker',
                              protocol='grpc'))

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(states=environment.states,
                                        actions=environment.actions,
                                        network=network))

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(
        gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {} after overall {} timesteps. Steps Per Second {}"
                .format(r.agent.episode, r.agent.timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    runner.run(timesteps=args.timesteps,
               episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()
Example #4
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('rom', help="File path of the rom")
    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")
    parser.add_argument(
        '-w',
        '--workers',
        help="Number of threads to run where the model is shared",
        type=int,
        default=16)
    parser.add_argument('-fs',
                        '--frame-skip',
                        help="Number of frames to repeat action",
                        type=int,
                        default=1)
    parser.add_argument('-rap',
                        '--repeat-action-probability',
                        help="Repeat action probability",
                        type=float,
                        default=0.0)
    parser.add_argument('-lolt',
                        '--loss-of-life-termination',
                        help="Loss of life counts as terminal state",
                        action='store_true')
    parser.add_argument('-lolr',
                        '--loss-of-life-reward',
                        help="Loss of life reward/penalty. EX: -1 to penalize",
                        type=float,
                        default=0.0)
    parser.add_argument(
        '-ea',
        '--epsilon-annealing',
        help='Create separate epsilon annealing schedules per thread',
        action='store_true')
    parser.add_argument('-ds',
                        '--display-screen',
                        action='store_true',
                        default=False,
                        help="Display emulator screen")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=50000,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--max-timesteps',
                        type=int,
                        default=2000,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # configurable!!!
    logger.addHandler(logging.StreamHandler(sys.stdout))

    environments = [
        ALE(args.rom,
            frame_skip=args.frame_skip,
            repeat_action_probability=args.repeat_action_probability,
            loss_of_life_termination=args.loss_of_life_termination,
            loss_of_life_reward=args.loss_of_life_reward,
            display_screen=args.display_screen) for _ in range(args.workers)
    ]

    if args.network_spec:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent_configs = []
    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    for i in range(args.workers):
        worker_config = deepcopy(agent_config)

        # Optionally overwrite epsilon final values
        if "explorations_spec" in worker_config and worker_config[
                'explorations_spec']['type'] == "epsilon_anneal":
            if args.epsilon_annealing:
                # epsilon final values are [0.5, 0.1, 0.01] with probabilities [0.3, 0.4, 0.3]
                epsilon_final = np.random.choice([0.5, 0.1, 0.01],
                                                 p=[0.3, 0.4, 0.3])
                worker_config['explorations_spec'][
                    "epsilon_final"] = epsilon_final

        agent_configs.append(worker_config)

    # Let the first agent create the model
    # Manually assign model
    logger.info(agent_configs[0])

    agent = Agent.from_spec(spec=agent_configs[0],
                            kwargs=dict(states=environments[0].states,
                                        actions=environments[0].actions,
                                        network=network_spec))

    agents = [agent]
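    # The workers created below reuse this first agent's model (model=agent.model),
    # so all threads share and update a single set of parameters.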

    for i in range(args.workers - 1):
        config = agent_configs[i]
        agent_type = config.pop('type', None)
        worker = WorkerAgentGenerator(AgentsDictionary[agent_type])(
            states=environments[0].states,
            actions=environments[0].actions,
            network=network_spec,
            model=agent.model,
            **config)
        agents.append(worker)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_configs[0])

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {}.".format(save_dir))

    def episode_finished(stats):
        if args.debug:
            logger.info(
                "Thread {t}. Finished episode {ep} after {ts} timesteps. Reward {r}"
                .format(t=stats['thread_id'],
                        ep=stats['episode'],
                        ts=stats['timestep'],
                        r=stats['episode_reward']))
        return True

    def summary_report(r):
        et = time.time()
        logger.info('=' * 40)
        logger.info('Current Step/Episode: {}/{}'.format(
            r.global_step, r.global_episode))
        logger.info('SPS: {}'.format(r.global_step / (et - r.start_time)))
        reward_list = r.episode_rewards
        if len(reward_list) > 0:
            logger.info('Max Reward: {}'.format(np.max(reward_list)))
            logger.info("Average of last 500 rewards: {}".format(
                sum(reward_list[-500:]) / min(500, len(reward_list))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(reward_list[-100:]) / min(100, len(reward_list))))
        logger.info('=' * 40)

    # Create runners
    threaded_runner = ThreadedRunner(agents,
                                     environments,
                                     repeat_actions=1,
                                     save_path=args.save,
                                     save_episodes=args.save_episodes)

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environments[0]))
    threaded_runner.run(summary_interval=100,
                        episode_finished=episode_finished,
                        summary_report=summary_report)
    threaded_runner.close()
    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=threaded_runner.global_episode))
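This threaded setup follows the asynchronous-workers pattern: each worker drives its own ALE instance while all workers update the single shared model created by the first agent above.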
Example #5
File: ale.py Project: niumeng07/dqn
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('rom', help="File path of the rom")
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    parser.add_argument('-fs', '--frame-skip', help="Number of frames to repeat action", type=int, default=1)
    parser.add_argument('-rap', '--repeat-action-probability', help="Repeat action probability", type=float, default=0.0)
    parser.add_argument('-lolt', '--loss-of-life-termination', help="Loss of life counts as terminal state", action='store_true')
    parser.add_argument('-lolr', '--loss-of-life-reward', help="Loss of life reward/penalty. EX: -1 to penalize", type=float, default=0.0)
    parser.add_argument('-ds', '--display-screen', action='store_true', default=False, help="Display emulator screen")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # configurable!!!
    logger.addHandler(logging.StreamHandler(sys.stdout))

    environment = ALE(args.rom, frame_skip=args.frame_skip,
                      repeat_action_probability=args.repeat_action_probability,
                      loss_of_life_termination=args.loss_of_life_termination,
                      loss_of_life_reward=args.loss_of_life_reward,
                      display_screen=args.display_screen)

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec
        )
    )

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}.".format(save_dir))

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    runner.close()
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    environment.close()
Example #6
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('path', help="Path to Pycolab game definition file")
    parser.add_argument('-i',
                        '--import-modules',
                        help="Import module(s) required for environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--visualize',
                        action='store_true',
                        default=False,
                        help="Enable Pycolab game's visualization")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    if args.path is not None:
        sys.path.append(os.path.dirname(os.path.expanduser(args.path)))
        game_name = os.path.splitext(os.path.basename(args.path))[0]
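
        # The game file's directory was appended to sys.path above so the game module
        # can be imported below by its bare module name.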

        try:
            game_env = importlib.import_module(game_name)
        except ImportError:
            raise TensorForceError(
                "Could not get game {0} from path {1}".format(
                    game_name, args.path))

    environment = DMPycolab(game=game_env.make_game(),
                            ui=game_env.get_ui(),
                            visualize=args.visualize)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(
                                states=environment.states,
                                actions=environment.actions,
                                network=network,
                            ))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {}.".format(save_dir))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)

        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        episode_finished=episode_finished,
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
Example #7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    parser.add_argument('-te', '--test', action='store_true', default=False, help="Test agent without learning.")
    parser.add_argument('-sl', '--sleep', type=float, default=None, help="Slow down simulation by sleeping for x seconds (fractions allowed).")
    parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.")
    parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(
        gym_id=args.gym_id,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video,
        visualize=args.visualize
    )

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network,
        )
    )
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}.".format(save_dir))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second
            ))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".
                        format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".
                        format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)

        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished,
        testing=args.test,
        sleep=args.sleep
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))
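For reference, a typical invocation of a runner script like this one might look like `python openai_gym.py CartPole-v0 -a ppo.json -n mlp2_network.json -e 3000`; the script name and the two JSON paths are placeholders for your local copies of the agent configuration and network specification files.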
Example #8
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 session_config=None,
                 scope='dqn-nstep',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 optimizer=None,
                 discount=0.99,
                 normalize_rewards=False,
                 variable_noise=None,
                 distributions_spec=None,
                 entropy_regularization=None,
                 target_sync_frequency=10000,
                 target_update_weight=1.0,
                 double_q_model=False,
                 huber_loss=None,
                 preprocessing=None,
                 exploration=None,
                 reward_preprocessing=None,
                 batched_observe=1000,
                 batch_size=32,
                 keep_last_timestep=True):
        """
        Creates a DQN n-step agent.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            session_config: Optional tf.ConfigProto with additional desired session configurations.
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`,
                and `evolutionary`. Consult the optimizer test or example configurations for more.
            discount: Float specifying reward discount factor.
            normalize_rewards: Boolean flag specifying whether to normalize rewards, default False.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each
                preprocessor is a dict containing a type and optional necessary arguments.
            exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise)
                and arguments.
            reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying the number of samples collected via `observe` before an update is executed.
            keep_last_timestep: Boolean flag specifying whether the last sample is kept for the next batch, default True.
        """

        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.double_q_model = double_q_model
        self.huber_loss = huber_loss

        super(DQNNstepAgent,
              self).__init__(states_spec=states_spec,
                             actions_spec=actions_spec,
                             preprocessing=preprocessing,
                             exploration=exploration,
                             reward_preprocessing=reward_preprocessing,
                             batched_observe=batched_observe,
                             batch_size=batch_size,
                             keep_last_timestep=keep_last_timestep)
Example #9
    def update(self,
               states,
               internals,
               actions,
               terminal,
               reward,
               return_loss_per_instance=False):
        fetches = [self.optimization]

        # Optionally fetch loss per instance
        if return_loss_per_instance:
            fetches.append(self.loss_per_instance)

        terminal = np.asarray(terminal)
        batched = (terminal.ndim == 1)
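        # A one-dimensional terminal array indicates a batch of transitions; a scalar
        # terminal means a single transition, whose values are wrapped in tuples below.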
        if batched:
            # TEMP: Random sampling fix
            if self.random_sampling_fix:
                feed_dict = {
                    state_input: states[name][0]
                    for name, state_input in self.states_input.items()
                }
                feed_dict.update({
                    state_input: states[name][1]
                    for name, state_input in self.next_states_input.items()
                })
            else:
                feed_dict = {
                    state_input: states[name]
                    for name, state_input in self.states_input.items()
                }
            feed_dict.update({
                internal_input: internals[n]
                for n, internal_input in enumerate(self.internals_input)
            })
            feed_dict.update({
                action_input: actions[name]
                for name, action_input in self.actions_input.items()
            })
            feed_dict[self.terminal_input] = terminal
            feed_dict[self.reward_input] = reward
        else:
            # TEMP: Random sampling fix
            if self.random_sampling_fix:
                raise TensorForceError("Unbatched version not covered by fix.")
            else:
                feed_dict = {
                    state_input: (states[name], )
                    for name, state_input in self.states_input.items()
                }
            feed_dict.update({
                internal_input: (internals[n], )
                for n, internal_input in enumerate(self.internals_input)
            })
            feed_dict.update({
                action_input: (actions[name], )
                for name, action_input in self.actions_input.items()
            })
            feed_dict[self.terminal_input] = (terminal, )
            feed_dict[self.reward_input] = (reward, )

        feed_dict[self.deterministic_input] = True
        feed_dict[self.update_input] = True

        fetched = self.monitored_session.run(fetches=fetches,
                                             feed_dict=feed_dict)

        if return_loss_per_instance:
            return fetched[1]
Example #10
    def get_batch(self, batch_size, next_states=False):
        """
        Samples a batch of the specified size according to priority.

        Args:
            batch_size: The batch size
            next_states: A boolean flag indicating whether 'next_states' values should be included

        Returns: A dict containing states, actions, rewards, terminals, internal states (and next states)

        """
        if batch_size > len(self.observations):
            raise TensorForceError("Batch size is larger than number of observations in memory.")

        states = {
            name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype(state['type']))
            for name, state in self.states_spec.items()
        }
        internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config]
        actions = {
            name: np.zeros((batch_size,) + tuple(action['shape']), dtype=util.np_dtype(action['type']))
            for name, action in self.actions_spec.items()
        }
        terminal = np.zeros((batch_size,), dtype=util.np_dtype('bool'))
        reward = np.zeros((batch_size,), dtype=util.np_dtype('float'))
        if next_states:
            next_states = {
                name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype(state['type']))
                for name, state in self.states_spec.items()
            }
            next_internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config]

        self.batch_indices = list()
        not_sampled_index = self.none_priority_index
        sum_priorities = sum(priority for priority, _ in self.observations if priority is not None)
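        # Three sampling regimes: observations that do not yet have a priority are taken
        # first in insertion order; if all priorities are (near) zero, sample uniformly
        # without replacement; otherwise roulette-wheel sample proportionally to priority.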
        for n in xrange(batch_size):
            if not_sampled_index < len(self.observations):
                _, observation = self.observations[not_sampled_index]
                index = not_sampled_index
                not_sampled_index += 1
            elif sum_priorities / self.capacity < util.epsilon:
                index = randrange(self.none_priority_index)
                while index in self.batch_indices:
                    index = randrange(self.none_priority_index)
                _, observation = self.observations[index]
            else:
                while True:
                    sample = random()
                    for index, (priority, observation) in enumerate(self.observations):
                        sample -= priority / sum_priorities
                        if sample < 0.0 or index >= self.none_priority_index:
                            break
                    if index not in self.batch_indices:
                        break

            for name, state in states.items():
                state[n] = observation[0][name]
            for k, internal in enumerate(internals):
                internal[n] = observation[1][k]
            for name, action in actions.items():
                action[n] = observation[2][name]
            terminal[n] = observation[3]
            reward[n] = observation[4]
            if next_states:
                for name, next_state in next_states.items():
                    next_state[n] = observation[5][name]
                for k, next_internal in enumerate(next_internals):
                    next_internal[n] = observation[6][k]
            self.batch_indices.append(index)

        if next_states:
            return dict(
                states=states, internals=internals, actions=actions, terminal=terminal,
                reward=reward, next_states=next_states, next_internals=next_internals
            )
        else:
            return dict(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward)
Example #11
    def __init__(
            self,
            states_spec,
            actions_spec,
            network_spec,
            device=None,
            session_config=None,
            scope='ppo',
            saver_spec=None,
            summary_spec=None,
            distributed_spec=None,
            discount=0.99,
            normalize_rewards=False,
            variable_noise=None,
            distributions_spec=None,
            entropy_regularization=1e-2,
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            preprocessing=None,
            exploration=None,
            reward_preprocessing=None,
            batched_observe=1000,
            batch_size=1000,
            keep_last_timestep=True,
            likelihood_ratio_clipping=None,
            step_optimizer=None,
            optimization_steps=10
    ):

        # random_sampling=True  # Sampling strategy for replay memory

        """
        Creates a Proximal Policy Optimization (PPO) agent
        ([Schulman et al., 2017](https://openai-public.s3-us-west-2.amazonaws.com/blog/2017-07/ppo/ppo-arxiv.pdf)).

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            session_config: optional tf.ConfigProto with additional desired session configurations
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            discount: Float specifying reward discount factor.
            normalize_rewards: Boolean flag specifying whether to normalize rewards, default False.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            baseline_mode: String specifying baseline mode, `states` for a separate baseline per state, `network`
                for sharing parameters with the training network.
            baseline: Optional dict specifying baseline type (e.g. `mlp`, `cnn`), and its layer sizes. Consult
                examples/configs for full example configurations.
            baseline_optimizer: Optional dict specifying an optimizer and its parameters for the baseline following
                the same conventions as the main optimizer.
            gae_lambda: Optional float specifying lambda parameter for generalized advantage estimation.
            preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each
                preprocessor is a dict containing a type and optional necessary arguments.
            exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise)
                and arguments.
            reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying number of samples collected via `observe` before an update is executed.
            keep_last_timestep: Boolean flag specifying whether last sample is kept, default True.
            likelihood_ratio_clipping: Optional clipping of likelihood ratio between old and new policy.
            step_optimizer: Optimizer dict specification for optimizer used in each PPO update step, defaults to
                Adam if None.
            optimization_steps: Int specifying number of optimization steps to execute on the collected batch using
                the step optimizer.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.baseline_mode = baseline_mode
        self.baseline = baseline
        self.baseline_optimizer = baseline_optimizer
        self.gae_lambda = gae_lambda
        self.likelihood_ratio_clipping = likelihood_ratio_clipping

        if step_optimizer is None:
            step_optimizer = dict(
                type='adam',
                learning_rate=1e-4
            )

        self.optimizer = dict(
            type='multi_step',
            optimizer=step_optimizer,
            num_steps=optimization_steps
        )

        super(PPOAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            keep_last_timestep=keep_last_timestep
        )
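A minimal usage sketch for the constructor above, assuming the usual `from tensorforce.agents import PPOAgent` import path of this TensorForce version; the state/action specs, network layers and hyperparameters are illustrative values, not recommendations.

from tensorforce.agents import PPOAgent

agent = PPOAgent(
    states_spec=dict(shape=(8,), type='float'),
    actions_spec=dict(type='int', num_actions=4),
    network_spec=[
        dict(type='dense', size=64, activation='relu'),
        dict(type='dense', size=64, activation='relu')
    ],
    batch_size=1000,
    step_optimizer=dict(type='adam', learning_rate=1e-4),  # wrapped into the multi_step optimizer above
    optimization_steps=10,
    likelihood_ratio_clipping=0.2
)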
Exemplo n.º 12
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 scope='ppo',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 discount=0.99,
                 normalize_rewards=False,
                 variable_noise=None,
                 distributions_spec=None,
                 entropy_regularization=1e-2,
                 baseline_mode=None,
                 baseline=None,
                 baseline_optimizer=None,
                 gae_lambda=None,
                 preprocessing=None,
                 exploration=None,
                 reward_preprocessing=None,
                 batched_observe=1000,
                 batch_size=1000,
                 keep_last_timestep=True,
                 likelihood_ratio_clipping=None,
                 step_optimizer=None,
                 optimization_steps=10):

        # random_sampling=True  # Sampling strategy for replay memory
        """
        Creates a Proximal Policy Optimization (PPO) agent
        ([Schulman et al., 2017](https://openai-public.s3-us-west-2.amazonaws.com/blog/2017-07/ppo/ppo-arxiv.pdf)).

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            baseline_mode:
            baseline:
            baseline_optimizer:
            gae_lambda:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            keep_last_timestep:
            likelihood_ratio_clipping:
            step_optimizer:
            optimization_steps:
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.baseline_mode = baseline_mode
        self.baseline = baseline
        self.baseline_optimizer = baseline_optimizer
        self.gae_lambda = gae_lambda
        self.likelihood_ratio_clipping = likelihood_ratio_clipping

        if step_optimizer is None:
            step_optimizer = dict(type='adam', learning_rate=1e-4)

        self.optimizer = dict(type='multi_step',
                              optimizer=step_optimizer,
                              num_steps=optimization_steps)

        super(PPOAgent,
              self).__init__(states_spec=states_spec,
                             actions_spec=actions_spec,
                             preprocessing=preprocessing,
                             exploration=exploration,
                             reward_preprocessing=reward_preprocessing,
                             batched_observe=batched_observe,
                             batch_size=batch_size,
                             keep_last_timestep=keep_last_timestep)
Exemplo n.º 13
    def tf_apply(self, x, update=False):
        if util.rank(x) != 2:
            raise TensorForceError(
                'Invalid input rank for linear layer: {}, must be 2.'.format(util.rank(x))
            )

        if self.size is None:  # If size is None, the output size matches the input size (required for skip connections).
            self.size = x.shape[1].value

        weights_shape = (x.shape[1].value, self.size)

        if self.weights_init is None:
            stddev = min(0.1, sqrt(2.0 / (x.shape[1].value + self.size)))
            self.weights_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32)

        elif isinstance(self.weights_init, float):
            if self.weights_init == 0.0:
                self.weights_init = tf.zeros_initializer(dtype=tf.float32)
            else:
                self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32)

        elif isinstance(self.weights_init, list):
            self.weights_init = np.asarray(self.weights_init, dtype=np.float32)
            if self.weights_init.shape != weights_shape:
                raise TensorForceError(
                    'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape)
                )
            self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32)

        elif isinstance(self.weights_init, np.ndarray):
            if self.weights_init.shape != weights_shape:
                raise TensorForceError(
                    'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape)
                )
            self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32)

        elif isinstance(self.weights_init, tf.Tensor):
            if util.shape(self.weights_init) != weights_shape:
                raise TensorForceError(
                    'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape)
                )

        bias_shape = (self.size,)

        if isinstance(self.bias_init, bool):
            if self.bias_init:
                self.bias_init = tf.zeros_initializer(dtype=tf.float32)
            else:
                self.bias_init = None

        elif isinstance(self.bias_init, float):
            if self.bias_init == 0.0:
                self.bias_init = tf.zeros_initializer(dtype=tf.float32)
            else:
                self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32)

        elif isinstance(self.bias_init, list):
            self.bias_init = np.asarray(self.bias_init, dtype=np.float32)
            if self.bias_init.shape != bias_shape:
                raise TensorForceError(
                    'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape)
                )
            self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32)

        elif isinstance(self.bias_init, np.ndarray):
            if self.bias_init.shape != bias_shape:
                raise TensorForceError(
                    'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape)
                )
            self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32)

        elif isinstance(self.bias_init, tf.Tensor):
            if util.shape(self.bias_init) != bias_shape:
                raise TensorForceError(
                    'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape)
                )

        if isinstance(self.weights_init, tf.Tensor):
            self.weights = self.weights_init
        else:
            self.weights = tf.get_variable(
                name='W',
                shape=weights_shape,
                dtype=tf.float32,
                initializer=self.weights_init
            )

        x = tf.matmul(a=x, b=self.weights)

        if self.bias_init is None:
            self.bias = None

        else:
            if isinstance(self.bias_init, tf.Tensor):
                self.bias = self.bias_init
            else:
                self.bias = tf.get_variable(name='b', shape=bias_shape, dtype=tf.float32, initializer=self.bias_init)

            x = tf.nn.bias_add(value=x, bias=self.bias)

        return x
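The branches above normalise `weights_init` into a TensorFlow initializer and validate its shape. The sketch below isolates the list-input branch as a stand-alone helper, assuming the TF 1.x API used in the layer (`tf.constant_initializer` with a `dtype` argument); the helper name is made up for illustration.

import numpy as np
import tensorflow as tf

def weights_initializer_from_list(weights_init, weights_shape):
    # Convert a nested Python list to a float32 matrix and check it against the expected (input, output) shape.
    weights = np.asarray(weights_init, dtype=np.float32)
    if weights.shape != weights_shape:
        raise ValueError('Weights shape {} does not match expected shape {}'.format(weights.shape, weights_shape))
    return tf.constant_initializer(value=weights, dtype=tf.float32)

# Example: a 3-input, 2-output linear layer initialised from an explicit weight matrix.
init = weights_initializer_from_list([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], weights_shape=(3, 2))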
Exemplo n.º 14
    def __init__(self, states_spec, actions_spec, config, **kwargs):

        # States and actions specifications
        self.states_spec = states_spec
        self.actions_spec = actions_spec

        # Discount factor
        self.discount = config.discount

        # Reward normalization
        assert isinstance(config.normalize_rewards, bool)
        self.normalize_rewards = config.normalize_rewards

        # Variable noise
        assert config.variable_noise is None or config.variable_noise > 0.0
        self.variable_noise = config.variable_noise

        # TensorFlow summaries
        self.summary_labels = set(config.summary_labels or ())

        # Variables and summaries
        self.variables = dict()
        self.all_variables = dict()
        self.summaries = list()

        if not config.local_model or not config.replica_model:
            # If not local_model mode or not internal global model
            self.default_graph = tf.Graph().as_default()
            self.graph = self.default_graph.__enter__()

        if config.cluster_spec is None:
            if config.parameter_server or config.replica_model or config.local_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device
            self.global_model = None

        elif config.parameter_server:
            if config.replica_model or config.local_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device
            self.global_model = None

        elif config.replica_model:
            self.device = tf.train.replica_device_setter(
                worker_device=config.device, cluster=config.cluster_spec)
            self.global_model = None

        elif config.local_model:
            if config.replica_model:
                raise TensorForceError(
                    "Invalid config value for distributed mode.")
            self.device = config.device

            global_config = config.copy()
            global_config.set(key='replica_model', value=True)

            self.global_model = self.__class__(states_spec=states_spec,
                                               actions_spec=actions_spec,
                                               config=global_config,
                                               **kwargs)

        else:
            raise TensorForceError(
                "Invalid config value for distributed mode.")

        with tf.device(device_name_or_function=self.device):

            # Timestep and episode
            # TODO: various modes !!!
            if self.global_model is None:
                # TODO: Variables seem to re-initialize in the beginning every time a runner starts
                self.timestep = tf.get_variable(name='timestep',
                                                dtype=tf.int32,
                                                initializer=0,
                                                trainable=False)
                self.episode = tf.get_variable(name='episode',
                                               dtype=tf.int32,
                                               initializer=0,
                                               trainable=False)
            else:
                self.timestep = self.global_model.timestep
                self.episode = self.global_model.episode

            with tf.name_scope(name=config.scope):

                def custom_getter(getter, name, registered=False, **kwargs):
                    variable = getter(
                        name=name,
                        **kwargs)  # Top-level, hence no 'registered'
                    if not registered and not name.startswith('optimization'):
                        self.all_variables[name] = variable
                        if kwargs.get('trainable', True):
                            self.variables[name] = variable
                        if 'variables' in self.summary_labels:
                            summary = tf.summary.histogram(name=name,
                                                           values=variable)
                            self.summaries.append(summary)
                    return variable

                # Create placeholders, tf functions, internals, etc
                self.initialize(custom_getter=custom_getter)

                # Input tensors
                states = self.get_states(states=self.state_inputs)
                internals = [
                    tf.identity(input=internal)
                    for internal in self.internal_inputs
                ]
                actions = self.get_actions(actions=self.action_inputs)
                terminal = tf.identity(input=self.terminal_input)
                reward = self.get_reward(states=states,
                                         internals=internals,
                                         terminal=terminal,
                                         reward=self.reward_input)

                # Stop gradients for input preprocessing
                states = {
                    name: tf.stop_gradient(input=state)
                    for name, state in states.items()
                }
                actions = {
                    name: tf.stop_gradient(input=action)
                    for name, action in actions.items()
                }
                reward = tf.stop_gradient(input=reward)

                # Optimizer
                if config.optimizer is None:
                    self.optimizer = None
                elif config.local_model and not config.replica_model:
                    # If local_model mode and not internal global model
                    self.optimizer = GlobalOptimizer(
                        optimizer=config.optimizer)
                else:
                    self.optimizer = Optimizer.from_spec(spec=config.optimizer)

                # Create output fetch operations
                self.create_output_operations(states=states,
                                              internals=internals,
                                              actions=actions,
                                              terminal=terminal,
                                              reward=reward,
                                              deterministic=self.deterministic)

        if config.local_model and config.replica_model:
            # If local_model mode and internal global model
            return

        # Local and global initialize operations
        if config.local_model:
            init_op = tf.variables_initializer(
                var_list=(self.global_model.get_variables(
                    include_non_trainable=True)))
            local_init_op = tf.variables_initializer(
                var_list=(self.get_variables(include_non_trainable=True)))

        else:
            init_op = tf.variables_initializer(var_list=(self.get_variables(
                include_non_trainable=True)))
            local_init_op = None

        # Summary operation
        if len(self.get_summaries()) > 0:
            summary_op = tf.summary.merge(inputs=self.get_summaries())
        else:
            summary_op = None

        # TODO: MonitoredSession or so?
        self.supervisor = tf.train.Supervisor(
            is_chief=(config.task_index == 0),
            init_op=init_op,
            local_init_op=local_init_op,
            logdir=config.model_directory,
            summary_op=summary_op,
            global_step=self.timestep,
            save_summaries_secs=config.summary_frequency,
            save_model_secs=config.save_frequency
            # checkpoint_basename='model.ckpt'
            # session_manager=None
        )

        # tf.ConfigProto(device_filters=['/job:ps', '/job:worker/task:{}/cpu:0'.format(self.task_index)])
        if config.parameter_server:
            self.server = tf.train.Server(
                server_or_cluster_def=config.cluster_spec,
                job_name='ps',
                task_index=config.task_index,
                # config=tf.ConfigProto(device_filters=["/job:ps"])
                # config=tf.ConfigProto(
                #     inter_op_parallelism_threads=2,
                #     log_device_placement=True
                # )
            )

            # Param server does nothing actively
            self.server.join()

        elif config.cluster_spec is not None:
            self.server = tf.train.Server(
                server_or_cluster_def=config.cluster_spec,
                job_name='worker',
                task_index=config.task_index,
                # config=tf.ConfigProto(device_filters=["/job:ps"])
                # config=tf.ConfigProto(
                #     inter_op_parallelism_threads=2,
                #     log_device_placement=True
                # )
            )

            self.managed_session = self.supervisor.managed_session(
                master=self.server.target, start_standard_services=True)
            self.session = self.managed_session.__enter__()

        else:
            self.managed_session = self.supervisor.managed_session(
                start_standard_services=True)
            self.session = self.managed_session.__enter__()
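The constructor above picks a device and, in `local_model` mode, builds an internal global (replica) model based on a handful of distributed-config flags. The dictionaries below are a hedged illustration of how those flags might be set for a worker and a parameter server; all addresses, device strings and task indices are made up.

cluster_spec = {
    'ps': ['localhost:2222'],                        # one parameter server
    'worker': ['localhost:2223', 'localhost:2224']   # two workers
}

# A worker in local_model mode: builds its own graph plus an internal global (replica) model.
worker_config = dict(
    cluster_spec=cluster_spec,
    parameter_server=False,
    replica_model=False,
    local_model=True,
    device='/job:worker/task:0',
    task_index=0
)

# The parameter server builds no model; it only starts tf.train.Server and joins.
ps_config = dict(
    cluster_spec=cluster_spec,
    parameter_server=True,
    replica_model=False,
    local_model=False,
    device='/job:ps/task:0',
    task_index=0
)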
Exemplo n.º 15
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', default='DQNAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    env = OpenAIUniverse(args.gym_id)
    env.configure(remotes=1)

    default = dict(
        repeat_actions=1,
        actions=env.actions,
        states=env.states,
        max_episode_length=args.max_timesteps
    )

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()

    config.default(default)

    if args.network_config:
        network_config = Configuration.from_json(args.network_config).network_layers
    else:
        if config.network_layers:
            network_config = config.network_layers
        else:
            raise TensorForceError("Error: No network configuration provided.")

    if args.debug:
        print("Configuration:")
        print(config)

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[config.log_level])

    stack = None

    agent = create_agent(args.agent, config, network_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))
        runner.save_model(args.save, args.save_episodes)

    report_episodes = max(1, args.episodes // 1000)  # avoid modulo-by-zero for small episode counts
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.total_timesteps / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if getattr(args, 'monitor', None):  # the --monitor argument is commented out above
        env.gym.monitor.close()
    env.close()
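The script reads two JSON files via `Configuration.from_json`. The sketch below writes a pair of example files; only the keys the script itself accesses (`log_level`, `repeat_actions`, `network_layers`) are shown, and every value is illustrative since the full set of accepted keys depends on the chosen agent.

import json

agent_config = {
    "log_level": "info",
    "repeat_actions": 1,
    "batch_size": 32               # illustrative agent hyperparameter
}
network_config = {
    "network_layers": [
        {"type": "dense", "size": 64},
        {"type": "dense", "size": 64}
    ]
}

with open('agent_config.json', 'w') as fp:
    json.dump(agent_config, fp, indent=4)
with open('network_config.json', 'w') as fp:
    json.dump(network_config, fp, indent=4)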
Exemplo n.º 16
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-P',
        '--port',
        default=6025,
        help="Port on which the UE4 game listens for incoming RL-client connections"
    )
    parser.add_argument('-H',
                        '--host',
                        default=None,
                        help="Hostname of the UE4 Game (default: localhost)")
    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")
    parser.add_argument('-R',
                        '--random-test-run',
                        action="store_true",
                        help="Do a quick random test run on the env")

    args = parser.parse_args()

    # logging.basicConfig(filename="logfile.txt", level=logging.INFO)
    logging.basicConfig(stream=sys.stderr)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # We have to connect this remote env to get the specs.
    # We also discretize axis-mappings b/c we will use a deep q-network.
    # Use num_ticks==6 to match Nature paper by Mnih et al.
    # ("human cannot press fire button with more than 10Hz", dt=1/60)
    # TODO: Need to build in capturing and concat'ing last 4 images (plus 8-bit conversion!) into 1 input state signal.
    # TODO: Use pre-processor for that.
    environment = UE4Environment(host=args.host,
                                 port=args.port,
                                 connect=True,
                                 discretize_actions=True,
                                 num_ticks=6)
    environment.seed(200)

    # Do a quick random test-run with image capture of the first n images -> then exit after 1000 steps.
    if args.random_test_run:
        # Reset the env.
        s = environment.reset()
        img = Image.fromarray(
            s, "RGB" if len(environment.states["shape"]) == 3 else "L")
        # Save first received image as a sanity-check.
        img.save("reset.png")
        for i in range(1000):
            s, is_terminal, r = environment.execute(actions=random.choice(
                range(environment.actions["num_actions"])))
            if i < 10:
                img = Image.fromarray(s, "RGB")
                img.save("{:03d}.png".format(i))
            logging.debug("i={} r={} term={}".format(i, r, is_terminal))
            if is_terminal:
                environment.reset()
        quit()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(states_spec=environment.states,
                                        actions_spec=environment.actions,
                                        network_spec=network_spec))
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {} after {} timesteps. Steps Per Second {}".
                format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    runner.run(timesteps=args.timesteps,
               episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
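The TODO comments above note that the last four frames should be captured, converted and stacked into one input state via a pre-processor. A hedged sketch of such a states-preprocessing spec follows; it assumes this TensorForce version provides `image_resize`, `grayscale` and `sequence` preprocessor types, and the sizes are illustrative.

states_preprocessing_spec = [
    dict(type='image_resize', width=84, height=84),
    dict(type='grayscale'),
    dict(type='sequence', length=4)   # stack the last 4 processed frames into one state
]

# It could then be injected into the loaded config before Agent.from_spec, e.g.:
# agent_config['states_preprocessing'] = states_preprocessing_spec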
Exemplo n.º 17
def main():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s"
        ))
    logger.addHandler(console_handler)

    parser = argparse.ArgumentParser()

    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")

    args = parser.parse_args()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent_config['states_preprocessing'] = [{'type': 'flatten'}]

    logger.info("Start training")

    environment = OpenSim(env_id=1, visualize=True)

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(
                                states=environment.states,
                                actions=environment.actions,
                                network=network_spec,
                            ))

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    def episode_finished(r):
        if r.episode % 100 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
                .format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Episode timesteps: {}".format(r.episode_timestep))
            logger.info("Episode largest tile: {}".format(
                r.environment.largest_tile))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(timesteps=6000000,
               episodes=2,
               max_episode_timesteps=10000,
               episode_finished=episode_finished)

    terminal = False
    state = environment.reset()
    while not terminal:
        action = agent.act(state)
        state, terminal, reward = environment.execute(action)

    runner.close()
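The script above injects a `flatten` state preprocessor into the agent config before building the agent. The snippet below illustrates with plain numpy what that step conceptually does to a structured observation; the shape is made up and no TensorForce code is involved.

import numpy as np

observation = np.zeros((41, 3))                    # illustrative structured observation
flattened = observation.reshape(-1)                # what a 'flatten' preprocessor conceptually produces
print(observation.shape, '->', flattened.shape)    # (41, 3) -> (123,)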
Exemplo n.º 18
    def __init__(
            self,
            states_spec,
            actions_spec,
            batched_observe=1000,
            scope='learning_agent',
            # parameters specific to LearningAgents
            summary_spec=None,
            network_spec=None,
            discount=0.99,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            optimizer=None,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=None
    ):
        """
        Initializes the learning agent.

        Args:
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            discount (float): The reward discount factor.
            device: Device string specifying model device.
            session_config: optional tf.ConfigProto with additional desired session configurations
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`,
                and `evolutionary`. Consult the optimizer test or example configurations for more.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            states_preprocessing_spec: Optional list of states preprocessors to apply to state
                (e.g. `image_resize`, `grayscale`).
            explorations_spec: Optional dict specifying action exploration type (epsilon greedy
                or Gaussian noise).
            reward_preprocessing_spec: Optional dict specifying reward preprocessing.
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
        """

        # TensorFlow summaries & Configuration Meta Parameter Recorder options
        self.summary_spec = summary_spec
        if self.summary_spec is None:
            self.summary_labels = set()
        else:
            self.summary_labels = set(self.summary_spec.get('labels', ()))

        self.meta_param_recorder = None

        # if 'configuration' in self.summary_labels or 'print_configuration' in self.summary_labels:
        if any(k in self.summary_labels for k in ['configuration', 'print_configuration']):
            self.meta_param_recorder = MetaParameterRecorder(inspect.currentframe())
            if 'meta_dict' in self.summary_spec:
                # Custom Meta Dictionary passed
                self.meta_param_recorder.merge_custom(self.summary_spec['meta_dict'])
            if 'configuration' in self.summary_labels:
                # Setup for TensorBoard population
                self.summary_spec['meta_param_recorder_class'] = self.meta_param_recorder
            if 'print_configuration' in self.summary_labels:
                # Print to STDOUT (TODO: optimize output)
                self.meta_param_recorder.text_output(format_type=1)

        if network_spec is None:
            raise TensorForceError("No network_spec provided.")
        self.network_spec = network_spec

        self.discount = discount
        self.device = device
        self.session_config = session_config
        self.saver_spec = saver_spec
        self.distributed_spec = distributed_spec

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer

        self.variable_noise = variable_noise
        self.states_preprocessing_spec = states_preprocessing_spec
        self.explorations_spec = explorations_spec
        self.reward_preprocessing_spec = reward_preprocessing_spec
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization

        super(LearningAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            scope=scope
        )
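When the `summary_spec` labels include `configuration` or `print_configuration`, the constructor above activates the MetaParameterRecorder branch. A hedged example of such a spec follows; the directory, interval and `meta_dict` contents are illustrative.

summary_spec = dict(
    directory='./summaries/learning_agent',
    steps=50,
    labels=['losses', 'variables', 'configuration', 'print_configuration'],
    meta_dict=dict(experiment='baseline-run', notes='free-form metadata merged into the recorder')
)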
Exemplo n.º 19
    def get_batch(self, batch_size, next_states=False):
        """
        Samples a batch of the specified size according to priority.

        Args:
            batch_size: The batch size
            next_states: A boolean flag indicating whether 'next_states' values should be included

        Returns: A dict containing states, actions, rewards, terminals, internal states (and next states)

        """
        if batch_size > len(self.observations):
            raise TensorForceError(
                "Requested batch size is larger than observations in memory: increase config.first_update."
            )

        # Init empty states
        states = {
            name: np.zeros((batch_size, ) + tuple(state['shape']),
                           dtype=util.np_dtype(state['type']))
            for name, state in self.states_spec.items()
        }
        internals = [
            np.zeros((batch_size, ) + shape, dtype)
            for shape, dtype in self.internals_spec
        ]
        actions = {
            name: np.zeros((batch_size, ) + tuple(action['shape']),
                           dtype=util.np_dtype(action['type']))
            for name, action in self.actions_spec.items()
        }
        terminal = np.zeros((batch_size, ), dtype=util.np_dtype('bool'))
        reward = np.zeros((batch_size, ), dtype=util.np_dtype('float'))
        if next_states:
            next_states = {
                name: np.zeros((batch_size, ) + tuple(state['shape']),
                               dtype=util.np_dtype(state['type']))
                for name, state in self.states_spec.items()
            }
            next_internals = [
                np.zeros((batch_size, ) + shape, dtype)
                for shape, dtype in self.internals_spec
            ]

        # Start with unseen observations
        unseen_indices = list(
            xrange(self.none_priority_index + self.observations._capacity - 1,
                   len(self.observations) + self.observations._capacity - 1))
        self.batch_indices = unseen_indices[:batch_size]

        # Get remaining observations using weighted sampling
        remaining = batch_size - len(self.batch_indices)
        if remaining:
            samples = self.observations.sample_minibatch(remaining)
            sample_indices = [i for i, o in samples]
            self.batch_indices += sample_indices

        # Shuffle
        np.random.shuffle(self.batch_indices)

        # Collect observations
        for n, index in enumerate(self.batch_indices):
            observation, _ = self.observations._memory[index]

            for name, state in states.items():
                state[n] = observation[0][name]
            for k, internal in enumerate(internals):
                internal[n] = observation[1][k]
            for name, action in actions.items():
                action[n] = observation[2][name]
            terminal[n] = observation[3]
            reward[n] = observation[4]
            if next_states:
                for name, next_state in next_states.items():
                    next_state[n] = observation[5][name]
                for k, next_internal in enumerate(next_internals):
                    next_internal[n] = observation[6][k]

        if next_states:
            return dict(states=states,
                        internals=internals,
                        actions=actions,
                        terminal=terminal,
                        reward=reward,
                        next_states=next_states,
                        next_internals=next_internals)
        else:
            return dict(states=states,
                        internals=internals,
                        actions=actions,
                        terminal=terminal,
                        reward=reward)
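For a toy spec, the dict returned by `get_batch` holds one array per named state and action plus flat `terminal` and `reward` arrays. The numpy sketch below only illustrates that layout; the shapes and names are made up and no memory object is involved.

import numpy as np

batch_size = 4
states_spec = dict(screen=dict(shape=(8, 8), type='float'))
actions_spec = dict(move=dict(shape=(), type='int'))

batch = dict(
    states={name: np.zeros((batch_size,) + tuple(s['shape']), dtype=np.float32) for name, s in states_spec.items()},
    actions={name: np.zeros((batch_size,) + tuple(a['shape']), dtype=np.int64) for name, a in actions_spec.items()},
    terminal=np.zeros((batch_size,), dtype=np.bool_),
    reward=np.zeros((batch_size,), dtype=np.float32)
)
print(batch['states']['screen'].shape)   # (4, 8, 8)
print(batch['actions']['move'].shape)    # (4,)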
Exemplo n.º 20
def main():
    parser = argparse.ArgumentParser()

    # N.B.: if run from within lab, the working directory is something like lab/bazel-out/../../tensorforce.
    # Hence, relative paths will not work without first fetching the path of this run file.
    parser.add_argument('-id', '--level-id', default='tests/demo_map', help="DeepMind Lab level id")
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs")

    # Redirect output to file
    sys.stdout = open('lab_output.txt', 'w')

    args = parser.parse_args()

    environment = DeepMindLab(args.level_id)

    path = os.path.dirname(__file__)
    if args.agent_config:
        # Build an absolute path relative to this run file and load the JSON config.
        with open(os.path.join(path, args.agent_config), 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if not args.network_spec:
        raise TensorForceError("No network configuration provided.")
    else:
        # The argument is --network-spec, so read args.network_spec (not args.network_config).
        with open(os.path.join(path, args.network_spec), 'r') as fp:
            network_spec = json.load(fp=fp)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # configurable!!!

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec
        )
    )
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))

    report_episodes = max(1, args.episodes // 1000)  # avoid modulo-by-zero for small episode counts

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.total_timesteps / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    runner.close()
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))

    environment.close()
Exemplo n.º 21
    def tf_observe_timestep(self, states, internals, actions, terminal, reward):
        """

        Args:
            states ():
            internals ():
            actions ():
            terminal ():
            reward ():

        Returns:

        """
        # Store timestep in memory
        stored = self.memory.store(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward
        )

        # Periodic optimization
        with tf.control_dependencies(control_inputs=(stored,)):
            unit = self.update_mode['unit']
            batch_size = self.update_mode['batch_size']
            frequency = self.update_mode.get('frequency', batch_size)
            first_update = self.update_mode.get('first_update', 0)

            if unit == 'timesteps':
                # Timestep-based batch
                optimize = tf.logical_and(
                    x=tf.equal(x=(self.timestep % frequency), y=0),
                    y=tf.logical_and(
                        x=tf.greater_equal(x=self.timestep, y=batch_size),
                        y=tf.greater_equal(x=self.timestep, y=first_update)
                    )
                )
                batch = self.memory.retrieve_timesteps(n=batch_size)

            elif unit == 'episodes':
                # Episode-based batch
                optimize = tf.logical_and(
                    x=tf.equal(x=(self.episode % frequency), y=0),
                    y=tf.logical_and(
                        # Only update once per episode increment.
                        x=tf.greater(x=tf.count_nonzero(input_tensor=terminal), y=0),
                        y=tf.logical_and(
                            x=tf.greater_equal(x=self.episode, y=batch_size),
                            y=tf.greater_equal(x=self.episode, y=first_update)
                        )
                    )
                )
                batch = self.memory.retrieve_episodes(n=batch_size)

            elif unit == 'sequences':
                # Timestep-sequence-based batch
                sequence_length = self.update_mode.get('length', 8)
                optimize = tf.logical_and(
                    x=tf.equal(x=(self.timestep % frequency), y=0),
                    y=tf.logical_and(
                        x=tf.greater_equal(x=self.timestep, y=(batch_size + sequence_length - 1)),
                        y=tf.greater_equal(x=self.timestep, y=first_update)
                    )
                )
                batch = self.memory.retrieve_sequences(n=batch_size, sequence_length=sequence_length)

            else:
                raise TensorForceError("Invalid update unit: {}.".format(unit))

            # Do not calculate gradients for memory-internal operations.
            batch = util.map_tensors(
                fn=(lambda tensor: tf.stop_gradient(input=tensor)),
                tensors=batch
            )

            return tf.cond(
                pred=optimize,
                true_fn=(lambda: self.fn_optimization(**batch)),
                false_fn=tf.no_op
            )
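The branching above is driven entirely by the `update_mode` dict. The dicts below illustrate one plausible configuration per unit; every number is made up.

timestep_update = dict(unit='timesteps', batch_size=64, frequency=4, first_update=1000)
episode_update = dict(unit='episodes', batch_size=10, frequency=10)
sequence_update = dict(unit='sequences', batch_size=32, frequency=4, length=8)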
Exemplo n.º 22
    def __init__(self, config, model=None):
        """Initializes the reinforcement learning agent.

        Args:
            config (Configuration): configuration object containing at least `states`, `actions`, `preprocessing` and
                `exploration`.
            model (Model): optional model instance. If not supplied, a new model is created.

        """
        assert self.__class__.name is not None and self.__class__.model is not None
        config.default(Agent.default_config)

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(util.log_levels[config.log_level])

        # states config and preprocessing
        self.preprocessing = dict()
        if 'shape' in config.states:
            # only one state
            config.states = dict(state=config.states)
            self.unique_state = True
            if config.preprocessing is not None:
                config.preprocessing = dict(state=config.preprocessing)
        else:
            self.unique_state = False
        for name, state in config.states:
            state.default(dict(type='float'))
            if isinstance(state.shape, int):
                state.shape = (state.shape, )
            if config.preprocessing is not None and name in config.preprocessing:
                preprocessing = Preprocessing.from_config(
                    config=config.preprocessing[name])
                self.preprocessing[name] = preprocessing
                state.shape = preprocessing.processed_shape(shape=state.shape)

        # actions config and exploration
        self.exploration = dict()
        if 'continuous' in config.actions:
            # only one action
            config.actions = dict(action=config.actions)
            if config.exploration is not None:
                config.exploration = dict(action=config.exploration)
            self.unique_action = True
        else:
            self.unique_action = False
        for name, action in config.actions:
            if action.continuous:
                action.default(dict(shape=(), min_value=None, max_value=None))
            else:
                action.default(dict(shape=()))
            if isinstance(action.shape, int):
                action.shape = (action.shape, )
            if config.exploration is not None and name in config.exploration:
                self.exploration[name] = Exploration.from_config(
                    config=config.exploration[name])

        self.states_config = config.states
        self.actions_config = config.actions

        if model is None:
            self.model = self.__class__.model(config)
        else:
            if not isinstance(model, self.__class__.model):
                raise TensorForceError(
                    "Supplied model class `{}` does not match expected agent model class `{}`"
                    .format(
                        type(model).__name__, self.__class__.model.__name__))
            self.model = model

        not_accessed = config.not_accessed()
        if not_accessed:
            self.logger.warning("Configuration values not accessed: {}".format(
                ', '.join(not_accessed)))

        self.episode = -1
        self.timestep = 0
        self.reset()
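The constructor above rewrites a bare single-state (or single-action) config into a named dict before iterating. The stand-alone sketch below reproduces that wrapping rule with plain dicts; the helper name is made up and no Configuration object is used.

def normalize_states(states):
    # A bare spec with a 'shape' key means exactly one state; wrap it under the default name 'state'.
    if 'shape' in states:
        return dict(state=states), True
    return states, False

states, unique_state = normalize_states(dict(shape=(4,), type='float'))
print(states)         # {'state': {'shape': (4,), 'type': 'float'}}
print(unique_state)   # True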
Exemplo n.º 23
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")
    parser.add_argument(
        '--job',
        type=str,
        default=None,
        help="For distributed mode: The job type of this agent.")
    parser.add_argument(
        '--task',
        type=int,
        default=0,
        help="For distributed mode: The task index of this agent.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(gym_id=args.gym_id,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    # TEST
    agent["execution"] = dict(
        type="distributed",
        distributed_spec=dict(
            job=args.job,
            task_index=args.task,
            # parameter_server=(args.job == "ps"),
            cluster_spec=dict(
                ps=["192.168.2.107:22222"],
                worker=["192.168.2.107:22223"]
            )
        )
    ) if args.job else None
    # END: TEST

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(
                                states=environment.states,
                                actions=environment.actions,
                                network=network,
                            ))
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    runner.run(num_timesteps=args.timesteps,
               num_episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
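
The script above expects the -a/--agent and -n/--network files to contain JSON specs. A hedged sketch of what they might hold (the agent type name and all hyper-parameters are illustrative placeholders, not values taken from the library):

import json

agent_spec = {
    "type": "ppo_agent",      # hypothetical agent type
    "batch_size": 256,
    "discount": 0.99
}
network_spec = [
    {"type": "dense", "size": 64, "activation": "relu"},
    {"type": "dense", "size": 64, "activation": "relu"}
]

with open('agent.json', 'w') as fp:
    json.dump(agent_spec, fp, indent=4)
with open('network.json', 'w') as fp:
    json.dump(network_spec, fp, indent=4)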
Exemplo n.º 24
0
    def tf_apply(self, x, update):
        inputs_to_merge = list()
        for name in self.inputs:
            # Previous input, referenced by name or "*", like a normal network_spec
            # Not using named_tensors here, as that could lead to unintended results
            if name == "*" or name == "previous":
                inputs_to_merge.append(x)
            elif name in self.named_tensors:
                inputs_to_merge.append(self.named_tensors[name])
            else:
                # Key not found among the available inputs: list them and raise
                keys = list(self.named_tensors)
                raise TensorForceError(
                    'ComplexNetwork input "{}" does not exist. Available inputs: {}'.format(name, keys)
                )
        # Check whether the inputs need casting to a common, more precise dtype, so TensorFlow
        # does not raise an error when concatenating mixed dtypes.
        # Quick & dirty promotion order: bool=0, int32=10, int64=20, float32=30, float64=40
        cast_type_level = 0
        cast_type_dict = {'bool': 0, 'int32': 10, 'int64': 20, 'float32': 30, 'float64': 40}
        cast_type_func_dict = {0: tf.identity, 10: tf.to_int32, 20: tf.to_int64, 30: tf.to_float, 40: tf.to_double}
        # Scan inputs for the highest required cast type
        for tensor in inputs_to_merge:
            key = str(tensor.dtype.name)
            if key in cast_type_dict:
                if cast_type_dict[key] > cast_type_level:
                    cast_type_level = cast_type_dict[key]
            else:
                raise TensorForceError('Network spec "input" does not support dtype {}'.format(key))

        # Add casting if needed
        for index, tensor in enumerate(inputs_to_merge):
            key = str(tensor.dtype.name)
            if cast_type_dict[key] < cast_type_level:
                inputs_to_merge[index] = cast_type_func_dict[cast_type_level](tensor)

        input_tensor = tf.concat(values=inputs_to_merge, axis=self.axis)
        return input_tensor


class Output(Layer):
    """
    Output layer. Used for ComplexLayerNetwork's to capture the tensor
    under and name for use with Input layers.  Acts as a input to output passthrough.
    """
    def __init__(self,
        output,
        scope='output',
        summary_labels=()):
        """
        Output layer.

        Args:
            output: A string naming the tensor; it will be added to the available inputs.

        """
        self.output = output
        super(Output, self).__init__(scope=scope, summary_labels=summary_labels)

    def tf_apply(self, x, update):
        self.named_tensors[self.output] = x
        return x


class ComplexLayeredNetwork(LayerBasedNetwork):
    """
    Complex Network consisting of a sequence of layers, which can be created from a specification dict.
    """
    def __init__(self, complex_layers_spec, scope='layered-network', summary_labels=()):
        """
        Complex Layered network.

        Args:
            complex_layers_spec: List of layer specification dicts
        """
        super(ComplexLayeredNetwork, self).__init__(scope=scope, summary_labels=summary_labels)
        self.complex_layers_spec = complex_layers_spec
        self.Inputs = dict()

        layer_counter = Counter()

        for branch_spec in self.complex_layers_spec:
            for layer_spec in branch_spec:
                if isinstance(layer_spec['type'], str):
                    name = layer_spec['type']
                else:
                    name = 'layer'
                scope = name + str(layer_counter[name])
                layer_counter[name] += 1

                layer = Layer.from_spec(
                    spec=layer_spec,
                    kwargs=dict(scope=scope, summary_labels=summary_labels)
                )
                # Link named dictionary reference into Layer
                layer.tf_tensors(named_tensors=self.Inputs)
                self.add_layer(layer=layer)

    def tf_apply(self, x, internals, update, return_internals=False):
        if isinstance(x, dict):
            self.Inputs.update(x) 
            if len(x) == 1:              
                x = next(iter(x.values()))         

        internal_outputs = list()
        index = 0
        for layer in self.layers:
            layer_internals = [internals[index + n] for n in range(layer.num_internals)]
            index += layer.num_internals
            x = layer.apply(x, update, *layer_internals)

            if not isinstance(x, tf.Tensor):
                internal_outputs.extend(x[1])
                x = x[0]

        if return_internals:
            return x, internal_outputs
        else:
            return x

    @staticmethod
    def from_json(filename):  # TODO: NOT TESTED
        """
        Creates a complex_layered_network_builder from a JSON.

        Args:
            filename: Path to configuration

        Returns: A ComplexLayeredNetwork class with layers generated from the JSON
        """
        path = os.path.join(os.getcwd(), filename)
        with open(path, 'r') as fp:
            config = json.load(fp=fp)
        return ComplexLayeredNetwork(layers_spec=config)
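
To make the Input/Output wiring concrete, here is a hedged sketch of a complex_layers_spec the class above could consume. The 'input' and 'output' type strings and the layer sizes are assumptions for illustration; only the structure (a list of branches, each a list of layer dicts) is taken from the code:

complex_layers_spec = [
    [  # branch 0: process the state and publish the result under the name 'dense0_out'
        {"type": "dense", "size": 32},
        {"type": "output", "output": "dense0_out"}    # assumed spec name for the Output layer
    ],
    [  # branch 1: merge the current input ("*") with the published tensor, then continue
        {"type": "input", "inputs": ["*", "dense0_out"], "axis": 1},  # assumed spec name for the Input layer
        {"type": "dense", "size": 16}
    ]
]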
Exemplo n.º 25
0
    def create_tf_operations(self, config):
        """
        Creates generic TensorFlow operations and placeholders required for models.
        
        Args:
            config: Model configuration which must contain entries for states and actions.

        Returns:

        """
        self.action_taken = dict()
        self.internal_inputs = list()
        self.internal_outputs = list()
        self.internal_inits = list()

        # Placeholders
        with tf.variable_scope('placeholder'):
            # States
            self.state = dict()
            for name, state in config.states.items():
                self.state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

            # Actions
            self.action = dict()
            self.discrete_actions = []
            self.continuous_actions = []
            for name, action in config.actions:
                if action.continuous:
                    if not self.__class__.allows_continuous_actions:
                        raise TensorForceError(
                            "Error: Model does not support continuous actions."
                        )
                    self.action[name] = tf.placeholder(
                        dtype=util.tf_dtype('float'),
                        shape=(None, ),
                        name=name)
                else:
                    if not self.__class__.allows_discrete_actions:
                        raise TensorForceError(
                            "Error: Model does not support discrete actions.")
                    self.action[name] = tf.placeholder(
                        dtype=util.tf_dtype('int'), shape=(None, ), name=name)

            # Reward & terminal
            self.reward = tf.placeholder(dtype=tf.float32,
                                         shape=(None, ),
                                         name='reward')
            self.terminal = tf.placeholder(dtype=tf.bool,
                                           shape=(None, ),
                                           name='terminal')

            # Deterministic action flag
            self.deterministic = tf.placeholder(dtype=tf.bool,
                                                shape=(),
                                                name='deterministic')

        # Optimizer
        if config.optimizer is not None:
            learning_rate = config.learning_rate
            with tf.variable_scope('optimization'):
                optimizer = util.function(config.optimizer, optimizers)
                args = config.optimizer_args or ()
                kwargs = config.optimizer_kwargs or {}
                self.optimizer = optimizer(learning_rate, *args, **kwargs)
        else:
            self.optimizer = None
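
Every placeholder above gets a leading None batch dimension. A self-contained sketch of that pattern, written against tf.compat.v1 so it also runs on TensorFlow 2 (the state shape is hypothetical):

import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

state_shape = (64, 64, 3)  # hypothetical single-state shape
state = tf1.placeholder(dtype=tf.float32, shape=(None,) + state_shape, name='state')
action = tf1.placeholder(dtype=tf.int32, shape=(None,), name='action')
reward = tf1.placeholder(dtype=tf.float32, shape=(None,), name='reward')
terminal = tf1.placeholder(dtype=tf.bool, shape=(None,), name='terminal')
deterministic = tf1.placeholder(dtype=tf.bool, shape=(), name='deterministic')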
Exemplo n.º 26
0
    def __init__(
        self,
        states_spec,
        actions_spec,
        device=None,
        scope='constant',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        batched_observe=1000,
        action_values=None
    ):
        """
        Initializes a constant agent which returns a constant action of the provided shape.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            device: Device string specifying model device.
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            discount: Float specifying reward discount factor.
            normalize_rewards: Boolean flag specifying whether to normalize rewards, default False.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each
                preprocessor is a dict containing a type and optional necessary arguments.
            exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise)
                and arguments.
            reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            action_values: Action value specification; the keys must match the names in actions_spec.
        """

        if action_values is None:
            raise TensorForceError("No action_values for constant model provided.")

        self.optimizer = None
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.action_values = action_values

        super(ConstantAgent, self).__init__(
            states_spec,
            actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe
        )
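
A hedged instantiation sketch for the constant agent above; all names and values are illustrative, and the commented-out call assumes the constructor signature shown in this example:

states_spec = dict(shape=(4,), type='float')
actions_spec = dict(
    throttle=dict(type='float', shape=()),
    gear=dict(type='int', num_actions=3)
)
action_values = dict(throttle=0.5, gear=1)  # keys must match the action names in actions_spec

# agent = ConstantAgent(
#     states_spec=states_spec,
#     actions_spec=actions_spec,
#     action_values=action_values
# )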
Exemplo n.º 27
0
    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        session_config=None,
        scope='trpo',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        discount=0.99,
        variable_noise=None,
        states_preprocessing_spec=None,
        explorations_spec=None,
        reward_preprocessing_spec=None,
        distributions_spec=None,
        entropy_regularization=None,
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        batched_observe=1000,
        batch_size=1000,
        keep_last_timestep=True,
        likelihood_ratio_clipping=None,
        learning_rate=1e-3,
        cg_max_iterations=20,
        cg_damping=1e-3,
        cg_unroll_loop=False
    ):
        """
        Creates a Trust Region Policy Optimization ([Schulman et al., 2015](https://arxiv.org/abs/1502.05477)) agent.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            session_config: optional tf.ConfigProto with additional desired session configurations
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            discount: Float specifying reward discount factor.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            states_preprocessing_spec: Optional list of states preprocessors to apply to state  
                (e.g. `image_resize`, `grayscale`).
            explorations_spec: Optional dict specifying action exploration type (epsilon greedy  
                or Gaussian noise).
            reward_preprocessing_spec: Optional dict specifying reward preprocessing.
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            baseline_mode: String specifying baseline mode, `states` for a separate baseline per state, `network`
                for sharing parameters with the training network.
            baseline: Optional dict specifying baseline type (e.g. `mlp`, `cnn`), and its layer sizes. Consult
                examples/configs for full example configurations.
            baseline_optimizer: Optional dict specifying an optimizer and its parameters for the baseline
                following the same conventions as the main optimizer.
            gae_lambda: Optional float specifying lambda parameter for generalized advantage estimation.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying number of samples collected via `observe` before an update is executed.
            keep_last_timestep: Boolean flag specifying whether last sample is kept, default True.
            likelihood_ratio_clipping: Optional clipping of likelihood ratio between old and new policy.
            learning_rate: Learning rate which may be interpreted differently according to the optimizer, e.g. a natural
                gradient optimizer interprets the learning rate as the maximum KL-divergence between old and updated policy.
            cg_max_iterations: Int > 0 specifying conjugate gradient iterations, typically 10-20 are sufficient to
                find effective approximate solutions.
            cg_damping: Conjugate gradient damping value to increase numerical stability.
            cg_unroll_loop: Boolean indicating whether loop unrolling in TensorFlow is to be used which seems to
                impact performance negatively at this point, default False.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        self.optimizer = dict(
            type='optimized_step',
            optimizer=dict(
                type='natural_gradient',
                learning_rate=learning_rate,
                cg_max_iterations=cg_max_iterations,
                cg_damping=cg_damping,
                cg_unroll_loop=cg_unroll_loop,
            ),
            ls_max_iterations=10,
            ls_accept_ratio=0.9,
            ls_mode='exponential',
            ls_parameter=0.5,
            ls_unroll_loop=False
        )

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.variable_noise = variable_noise
        self.states_preprocessing_spec = states_preprocessing_spec
        self.explorations_spec = explorations_spec
        self.reward_preprocessing_spec = reward_preprocessing_spec
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.baseline_mode = baseline_mode
        self.baseline = baseline
        self.baseline_optimizer = baseline_optimizer
        self.gae_lambda = gae_lambda
        self.likelihood_ratio_clipping = likelihood_ratio_clipping

        super(TRPOAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            batch_size=batch_size,
            keep_last_timestep=keep_last_timestep
        )
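
The constructor above hard-codes a two-level optimizer: a natural-gradient step wrapped in a line search. Restated as a standalone dict with explanatory comments (values copied from the code; the annotations are interpretation, see the docstring note on learning_rate):

trpo_optimizer = dict(
    type='optimized_step',        # outer wrapper: line search over the proposed step
    optimizer=dict(
        type='natural_gradient',  # inner step direction via conjugate gradient
        learning_rate=1e-3,       # interpreted as the maximum KL-divergence per update
        cg_max_iterations=20,
        cg_damping=1e-3,
        cg_unroll_loop=False,
    ),
    ls_max_iterations=10,         # exponential backtracking line search
    ls_accept_ratio=0.9,
    ls_mode='exponential',
    ls_parameter=0.5,
    ls_unroll_loop=False
)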
Exemplo n.º 28
0
    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        scope='dqn',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,
        distributions_spec=None,
        entropy_regularization=None,
        target_sync_frequency=10000,
        target_update_weight=1.0,
        double_q_model=False,
        huber_loss=None,
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        batched_observe=1000,
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1
    ):
        """
        Creates a Deep-Q agent.

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            optimizer:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            target_sync_frequency:
            target_update_weight:
            double_q_model:
            huber_loss:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            memory:
            first_update:
            update_frequency:
            repeat_update:
        """

        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer

        if memory is None:
            memory = dict(
                type='replay',
                capacity=100000
            )
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.double_q_model = double_q_model
        self.huber_loss = huber_loss

        super(DQNAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )
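
When no optimizer or memory is supplied, the constructor above falls back to the defaults restated here; the commented-out instantiation is a hedged sketch with an illustrative network, not a prescribed configuration:

default_optimizer = dict(type='adam', learning_rate=1e-3)
default_memory = dict(type='replay', capacity=100000)

network_spec = [
    dict(type='dense', size=64, activation='relu'),
    dict(type='dense', size=64, activation='relu')
]

# agent = DQNAgent(
#     states_spec=dict(shape=(4,), type='float'),
#     actions_spec=dict(type='int', num_actions=2),
#     network_spec=network_spec,
#     double_q_model=True
# )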
Exemplo n.º 29
0
    def tf_apply(self, x, update):
        if self.beta_learn:
            self.beta = tf.get_variable(
                name='beta',
                shape=(),
                dtype=tf.float32,
                initializer=tf.ones_initializer()
            )

        if self.max is not None:
            x = tf.minimum(x=(self.beta * x), y=self.max)

        if self.min is not None:
            x = tf.maximum(x=(self.beta * x), y=self.min)

        if self.name == 'elu':
            x = tf.nn.elu(features=(self.beta * x))

        elif self.name == 'none':
            x = tf.identity(input=(self.beta * x))

        elif self.name == 'relu':
            x = tf.nn.relu(features=(self.beta * x))
            if 'relu' in self.summary_labels:
                non_zero = tf.cast(x=tf.count_nonzero(input_tensor=x), dtype=tf.float32)
                size = tf.cast(x=tf.reduce_prod(input_tensor=tf.shape(input=x)), dtype=tf.float32)
                tf.contrib.summary.scalar(name='relu', tensor=(non_zero / size))

        elif self.name == 'selu':
            # https://arxiv.org/pdf/1706.02515.pdf
            x = tf.nn.selu(features=(self.beta * x))

        elif self.name == 'sigmoid':
            x = tf.sigmoid(x=(self.beta * x))

        elif self.name == 'swish':
            # https://arxiv.org/abs/1710.05941
            x = tf.sigmoid(x=(self.beta * x)) * x

        elif self.name == 'lrelu' or self.name == 'leaky_relu':
            if self.alpha is None:
                # Default alpha value for leaky_relu
                self.alpha = 0.2
            x = tf.nn.leaky_relu(features=(self.beta * x), alpha=self.alpha)

        elif self.name == 'crelu':
            x = tf.nn.crelu(features=(self.beta * x))

        elif self.name == 'softmax':
            x = tf.nn.softmax(logits=(self.beta * x))

        elif self.name == 'softplus':
            x = tf.nn.softplus(features=(self.beta * x))

        elif self.name == 'softsign':
            x = tf.nn.softsign(features=(self.beta * x))

        elif self.name == 'tanh':
            x = tf.nn.tanh(x=(self.beta * x))

        else:
            raise TensorForceError('Invalid non-linearity: {}'.format(self.name))

        if 'beta' in self.summary_labels:
            tf.contrib.summary.scalar(name='beta', tensor=self.beta)

        return x
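
The layer above dispatches on self.name and optionally scales its input by beta (learnable when beta_learn is set). Hedged layer-spec sketches using the activation names handled in the code; the spec keys ('type', 'name', 'alpha', 'beta') are assumptions for illustration:

relu_spec = dict(type='nonlinearity', name='relu')
leaky_spec = dict(type='nonlinearity', name='lrelu', alpha=0.2)      # alpha defaults to 0.2 above
swish_spec = dict(type='nonlinearity', name='swish', beta='learn')   # hypothetical way to request a learnable beta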
Exemplo n.º 30
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--mode', help="ID of the game mode")
    parser.add_argument('--hide',
                        dest='hide',
                        action='store_const',
                        const=True,
                        default=False,
                        help="Hide output window")
    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=50000,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--max-timesteps',
                        type=int,
                        default=2000,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # TODO: make the log level configurable

    environment = MazeExplorer(mode_id=args.mode, visible=not args.hide)

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(states_spec=environment.states,
                                        actions_spec=environment.actions,
                                        network_spec=network_spec))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError(
                    "Cannot save agent to dir {} ({})".format(save_dir, e))

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
                .format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))
    runner.run(args.episodes,
               args.max_timesteps,
               episode_finished=episode_finished)
    runner.close()
    logger.info(
        "Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    environment.close()