Example #1
File: dqn_atari.py Project: rmst/chi
def dqn_atari(logdir, env='Pong', memory_size=100000):
  import numpy as np
  import gym
  import tensorflow as tf
  from gym import wrappers
  from tensorflow.contrib import layers
  from tensorflow.contrib.framework import arg_scope
  from chi.util import in_collections

  chi.set_loglevel('debug')
  log_top(logdir+'/logs/top')
  log_nvidia_smi(logdir+'/logs/nvidia-smi')


  env += 'NoFrameskip-v3'
  env = gym.make(env)
  env = chi.rl.wrappers.AtariWrapper(env)
  env = chi.rl.wrappers.StackFrames(env, 4)
  env = wrappers.SkipWrapper(4)(env)

  test = 10
  train = 40
  # record video for the first train episode and the first test episode of each train/test cycle
  env = monitor = wrappers.Monitor(env, logdir+'/monitor', video_callable=lambda i: i % (test+train) == 0 or i % (test+train) == train)

  print_env(env)

  @chi.model(tracker=tf.train.ExponentialMovingAverage(1-.0005),  # TODO: replace with original weight freeze
             optimizer=tf.train.RMSPropOptimizer(.00025, .95, .95, .01))
  def q_network(x):
    x /= 255
    x = layers.conv2d(x, 32, 8, 4)
    x = layers.conv2d(x, 64, 4, 2)
    x = layers.conv2d(x, 64, 3, 1)
    x = layers.flatten(x)
    x = layers.fully_connected(x, 512)
    x = layers.fully_connected(x, env.action_space.n, activation_fn=None)
    x = tf.identity(x, name='Q')
    return x

  memory = chi.rl.ReplayMemory(memory_size, 32)

  agent = DqnAgent(env, q_network, memory)

  from time import time
  step = monitor.get_total_steps()
  t = time()
  for ep in range(100000):
    for _ in range(train):
      agent.play_episode()

    for _ in range(test):
      agent.play_episode(test=True)

    ar = np.mean(monitor.get_episode_rewards()[-(train+test):-test])
    at = np.mean(monitor.get_episode_rewards()[-test:])
    ds = monitor.get_total_steps() - step
    step = monitor.get_total_steps()
    dt = time() - t
    t = time()
    print(f'av. test return {at}, av. train return {ar}, av. fps {ds/dt}')
Example #2
 def __init__(self, f, local_dependencies=None, start_chiboard=True):
   chi.set_loglevel('debug')
   super().__init__(f)
   self.start_chiboard = start_chiboard
   self.f = f
   self.local_dependencies = local_dependencies or []
   self.should_stop = False
   self.config = None
   self.logdir = None
   self.writers = {}
   self.global_step = None
   from inspect import Parameter
   params = dict(daemon=Parameter('daemon',
                                  Parameter.KEYWORD_ONLY,
                                  default=False,
                                  annotation="run in background"),
                 logdir=Parameter('logdir',
                                  Parameter.KEYWORD_ONLY,
                                  default=""))
   params.update(self.params)
   self.params = params
Example #3
def dqn_atari(self: Experiment,
              logdir=None,
              env='Pong',
              frameskip=4,
              timesteps=100000000,
              memory_size=100000,
              agents=2,
              replay_start=50000,
              tter=.25,
              duelling=True):
    from tensorflow.contrib import layers
    import gym
    from gym import wrappers
    import numpy as np

    chi.set_loglevel('debug')
    log_nvidia_smi(logdir + '/logs/nvidia-smi')

    class RenderMeta(chi.rl.Wrapper):
        def __init__(self, env, limits=None):
            super().__init__(env)
            self.an = env.action_space.n
            # self.q_plotters = [Plotter(limits=None, title=f'A({n})') for n in action_names]
            self.f, ax = plt.subplots(2, 2, figsize=(2 * 3, 2 * 2), dpi=64)
            self.f.set_tight_layout(True)
            ax = iter(np.reshape(ax, -1))
            self.q = Plotter(next(ax),
                             legend=[str(i) for i in range(self.an)],
                             title='Q - mean Q')
            self.qm = Plotter(next(ax), title='mean Q')
            self.r = Plotter(next(ax),
                             legend=['wrapped', 'unwrapped'],
                             title='reward')

        def _reset(self):
            obs = super()._reset()
            self.last_frame = np.tile(obs[:, :, -1:], (1, 1, 3))

            self.last_q = None
            return obs

        def _step(self, action):
            ob, r, done, info = super()._step(action)
            self.last_frame = np.tile(ob[:, :, -1:], (1, 1, 3))
            qs = self.meta.get('action_values', np.full(self.an, np.nan))
            qm = np.mean(qs)
            qs -= qm
            # [qp.append(qs[i, ...]) for i, qp in enumerate(self.q_plotters)]
            self.qm.append(qm)
            self.q.append(qs)
            self.r.append((r, info.get('unwrapped_reward',
                                       r), self.meta.get('td', np.nan)))
            return ob, r, done, info

        def _render(self, mode='human', close=False):
            f = super()._render(mode, close)
            # fs = [qp.draw() for qp in self.q_plotters]
            f2 = draw(self.f)
            return chi.rl.util.concat_frames(f, self.last_frame, f2)

    def make_env(i):
        e = env + 'NoFrameskip-v3'
        e = gym.make(e)
        e = chi.rl.wrappers.AtariWrapper(e)
        e = chi.rl.wrappers.StackFrames(e, 4)
        e = chi.rl.wrappers.SkipWrapper(e, 4)

        if i == 0:
            e = RenderMeta(e)
        e = wrappers.Monitor(e,
                             self.logdir + '/monitor_' + str(i),
                             video_callable=lambda j: (j % (20 if i == 0 else 200) == 0) if i < 4 else False)

        return e

    envs = [make_env(i) for i in range(agents)]
    env = envs[0]
    print_env(env)

    if duelling:
        # https://arxiv.org/abs/1511.06581

        @chi.model(
            tracker=tf.train.ExponentialMovingAverage(
                1 - .0005),  # TODO: replace with original weight freeze
            optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
        def q_network(x):
            x /= 255
            x = layers.conv2d(x, 32, 8, 4)
            x = layers.conv2d(x, 64, 4, 2)
            x = layers.conv2d(x, 64, 3, 1)
            x = layers.flatten(x)

            xv = layers.fully_connected(x, 512)
            val = layers.fully_connected(xv, 1, activation_fn=None)
            # val = tf.squeeze(val, 1)

            xa = layers.fully_connected(x, 512)
            adv = layers.fully_connected(xa,
                                         env.action_space.n,
                                         activation_fn=None)

            q = val + adv - tf.reduce_mean(adv, axis=1, keep_dims=True)
            q = tf.identity(q, name='Q')
            return q
    else:

        @chi.model(
            tracker=tf.train.ExponentialMovingAverage(
                1 - .0005),  # TODO: replace with original weight freeze
            optimizer=tf.train.RMSPropOptimizer(.00025, .95, .95, .01))
        def q_network(x):
            x /= 255
            x = layers.conv2d(x, 32, 8, 4)
            x = layers.conv2d(x, 64, 4, 2)
            x = layers.conv2d(x, 64, 3, 1)
            x = layers.flatten(x)
            x = layers.fully_connected(x, 512)
            x = layers.fully_connected(x,
                                       env.action_space.n,
                                       activation_fn=None)
            x = tf.identity(x, name='Q')
            return x

    dqn = DQN(env.action_space.n,
              env.observation_space.shape,
              q_network,
              replay_start=replay_start,
              logdir=logdir)

    for i, env in enumerate(envs):
        agent = dqn.make_agent(test=i in (0, 1),
                               memory_size=memory_size // agents,
                               logdir=logdir,
                               name=f'Agent_{i}')
        agent.run(env, async=True)

    log_top(logdir + '/logs/top')

    dqn.train(timesteps, tter)
Example #4
""" Tutorial for chi.model
    ----------------------
    This is how we can use python functions to define models
"""
import os
import chi
import tensorflow as tf
from tensorflow.contrib import layers  # Keras-style layers
from tensorflow.contrib import learn

chi.set_loglevel('debug')  # log whenever variables are created or shared


@chi.model
def my_digit_classifier(x: (None, 28 * 28)):  # specify shape as (None, 28*28)
    y = layers.fully_connected(x, 100)
    z = layers.fully_connected(y, 10, None)
    p = layers.softmax(z)
    return x, y, z, p


@chi.function
def train(x, labels: tf.int32):
    x, y, z, p = my_digit_classifier(x)  # creates the model parameters (first use of my_digit_classifier)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=z)

    gav = my_digit_classifier.compute_gradients(loss)

    th = my_digit_classifier.apply_gradients(gav)
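
A minimal way to exercise the tutorial above (this loop is not part of the original snippet; it assumes a @chi.function-decorated function can be called directly with NumPy batches, and it uses the TF 1.x tutorial MNIST reader):

from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('/tmp/mnist', one_hot=False)  # download path is only an example
for step in range(1000):
    images, labels = mnist.train.next_batch(64)  # images: (64, 784) float32, labels: (64,) ints
    train(images, labels)  # first call builds the graph, later calls only execute it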
Example #5
import tensorflow as tf
import numpy as np
from tensorflow.contrib import layers
from tensorflow.contrib import learn
import chi
from chi.util import ClippingOptimizer

chi.set_loglevel('debug')


@chi.experiment(local_dependencies=[chi])
def wgan_conv(alpha=.00005, c=.01, m=64, n_critic=5, logdir="~/chi-results/+"):
    """ Wasserstein GAN with convolutional nets
  Paper: https://arxiv.org/abs/1701.07875
  """
    @chi.model(optimizer=ClippingOptimizer(tf.train.RMSPropOptimizer(alpha),
                                           -c, c))
    def critic(x: [[64, 64, 3]]):
        x = layers.conv2d(x, 128, 5, 2)
        x = layers.conv2d(x, 256, 5, 2)
        x = layers.conv2d(x, 512, 5, 2)
        x = layers.conv2d(x, 1024, 5, 2)
        y = layers.fully_connected(x, 1, None)  # linear
        return y

    @chi.model(optimizer=tf.train.RMSPropOptimizer(alpha))
    def generator(z):
        zp = layers.fully_connected(z, 1024 * 4 * 4, None)
        x = tf.reshape(zp, [m, 4, 4, 1024])
        x = layers.conv2d_transpose(x, 512, 5, 2)
        x = layers.conv2d_transpose(x, 256, 5, 2)
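
For reference, the Wasserstein objective this snippet builds toward can be sketched in plain TensorFlow 1.x. This is a generic illustration of the losses from the linked paper, not code from the chi project; f_real and f_fake stand in for the critic's outputs on a real batch and on generator(z):

import tensorflow as tf

# Generic WGAN losses (https://arxiv.org/abs/1701.07875) -- illustration only.
f_real = tf.placeholder(tf.float32, [None, 1])  # critic output on real images
f_fake = tf.placeholder(tf.float32, [None, 1])  # critic output on generated images

# The critic maximizes E[f(x)] - E[f(G(z))]; written as a loss to minimize:
critic_loss = tf.reduce_mean(f_fake) - tf.reduce_mean(f_real)
# The generator minimizes -E[f(G(z))]; it is updated once per n_critic critic updates,
# while the critic's weights stay clipped to [-c, c] (ClippingOptimizer above).
generator_loss = -tf.reduce_mean(f_fake)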
Example #6
def dqn_car(self: Experiment, logdir=None, frameskip=5,
            timesteps=100000000, memory_size=100000,
            agents=3,
            replay_start=50000,
            tter=.25,
            duelling=True):
  from tensorflow.contrib import layers
  import gym
  from gym import wrappers
  import numpy as np

  chi.set_loglevel('debug')
  log_top(logdir + '/logs/top')
  log_nvidia_smi(logdir + '/logs/nvidia-smi')

  actions = [[0, 1], [0, -1],
             [1, 1], [1, -1],
             [-1, 1], [-1, -1]]
  action_names = ['fw', 'bw', 'fw_r', 'bw_r', 'fw_l', 'bw_l']

  class RenderMeta(chi.rl.Wrapper):
    def __init__(self, env, limits=None, mod=False):
      super().__init__(env)
      self.mod = mod
      self.an = env.action_space.n
      # self.q_plotters = [Plotter(limits=None, title=f'A({n})') for n in action_names]
      self.f, ax = plt.subplots(2, 4, figsize=(4 * 3, 2 * 2), dpi=64)
      self.f.set_tight_layout(True)
      ax = iter(np.reshape(ax, -1))
      self.q = Plotter(next(ax), title='Q - mean Q', legend=action_names)
      self.mq = Plotter(next(ax), title='mean Q')
      self.r = Plotter(next(ax), limits=None, title='reward')
      self.s = Plotter(next(ax), title='speed')
      self.a = Plotter(next(ax), title='av_speed')
      self.d = Plotter(next(ax), title='distance_from_road')
      self.td = Plotter(next(ax), title='td')

    def _step(self, action):
      ob, r, done, info = super()._step(action)
      qs = self.meta.get('action_values', np.full(self.an, np.nan))
      mq = np.mean(qs)
      qs -= mq
      # [qp.append(qs[i, ...]) for i, qp in enumerate(self.q_plotters)]

      self.q.append(qs)
      self.mq.append(mq)
      self.r.append(r)
      self.s.append(info.get('speed', np.nan))
      self.a.append(info.get('average_speed', np.nan))
      self.d.append(info.get('distance_from_road', np.nan))
      self.td.append(self.meta.get('td', 0))

      return ob, r, done, info

    def _render(self, mode='human', close=False):
      f = super()._render(mode, close)
      # fs = [qp.draw() for qp in self.q_plotters]
      f2 = draw(self.f)
      return chi.rl.util.concat_frames(f, self.obs, f2)

    def _observation(self, observation):
      if self.mod:
        observation = np.asarray(np.random.normal(observation, 30), dtype=np.uint8)
        np.clip(observation, 0, 255, observation)
      self.obs = np.tile(observation[:, :, np.newaxis], (1, 1, 3))
      return observation

  class ScaleRewards(gym.Wrapper):
    def _step(self, a):
      ob, r, d, i = super()._step(a)
      i.setdefault('unwrapped_reward', r)
      r /= frameskip
      return ob, r, d, i

  def make_env(i):
    import rlunity
    env = gym.make('UnityCarPixels-v0')
    r = 100
    env.unwrapped.conf(loglevel='info', log_unity=True, logfile=logdir + f'/logs/unity_{i}', w=r, h=r)

    env = DiscretizeActions(env, actions)

    if i in (0, 1):
      env = RenderMeta(env, mod=i == 1)
    env = wrappers.Monitor(env, self.logdir + '/monitor_' + str(i),
                           video_callable=lambda j: j % (5 if i in (0, 1) else 200) == 0)

    # env = wrappers.SkipWrapper(frameskip)(env)
    # env = ScaleRewards(env)
    env = chi.rl.StackFrames(env, 4)
    return env

  envs = [make_env(i) for i in range(agents)]
  env = envs[0]
  print_env(env)

  if duelling:
    # https://arxiv.org/abs/1511.06581

    @chi.model(tracker=tf.train.ExponentialMovingAverage(1 - .0005),  # TODO: replace with original weight freeze
               optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
    def q_network(x):
      x /= 255
      x = layers.conv2d(x, 32, 8, 4)
      x = layers.conv2d(x, 64, 4, 2)
      x = layers.conv2d(x, 64, 3, 1)
      x = layers.flatten(x)

      xv = layers.fully_connected(x, 512)
      val = layers.fully_connected(xv, 1, activation_fn=None)
      # val = tf.squeeze(val, 1)

      xa = layers.fully_connected(x, 512)
      adv = layers.fully_connected(xa, env.action_space.n, activation_fn=None)

      q = val + adv - tf.reduce_mean(adv, axis=1, keep_dims=True)
      q = tf.identity(q, name='Q')
      return q
  else:
    @chi.model(tracker=tf.train.ExponentialMovingAverage(1 - .0005),  # TODO: replace with original weight freeze
               optimizer=tf.train.RMSPropOptimizer(.00025, .95, .95, .01))
    def q_network(x):
      x /= 255
      x = layers.conv2d(x, 32, 8, 4)
      x = layers.conv2d(x, 64, 4, 2)
      x = layers.conv2d(x, 64, 3, 1)
      x = layers.flatten(x)
      x = layers.fully_connected(x, 512)
      x = layers.fully_connected(x, env.action_space.n, activation_fn=None)
      x = tf.identity(x, name='Q')
      return x

  dqn = DQN(env.action_space.n,
            env.observation_space.shape,
            q_network,
            clip_td=False,
            replay_start=replay_start,
            logdir=logdir)

  for i, env in enumerate(envs):
    agent = dqn.make_agent(test=i in (0, 1), memory_size=memory_size // agents, logdir=logdir, name=f'Agent_{i}')
    agent.run(env, async=True)

  dqn.train(timesteps, tter)
Example #7
def dqn_atari(self: Experiment,
              logdir=None,
              env='Pong',
              frameskip=4,
              timesteps=100000000,
              memory_size=100000,
              agents=2,
              replay_start=50000,
              tter=.25,
              n_heads=3):
    from tensorflow.contrib import layers
    import gym
    from gym import wrappers
    import numpy as np

    chi.set_loglevel('info')
    log_top(logdir + '/logs/top')
    log_nvidia_smi(logdir + '/logs/nvidia-smi')

    memory = chi.rl.ReplayMemory(memory_size, 32)

    actions = [[0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]
    action_names = ['fw', 'bw', 'fw_r', 'bw_r', 'fw_l', 'bw_l']

    class RenderMeta(chi.rl.Wrapper):
        def __init__(self, env, limits=None, mod=False):
            super().__init__(env)
            self.mod = mod
            self.an = env.action_space.n
            # self.q_plotters = [Plotter(limits=None, title=f'A({n})') for n in action_names]
            self.f, ax = plt.subplots(2, 3, figsize=(3 * 3, 2 * 2), dpi=64)
            self.f.set_tight_layout(True)
            ax = iter(np.reshape(ax, -1))
            self.q = Plotter(next(ax), title='A', legend=action_names)
            self.r = Plotter(next(ax), limits=None, title='reward')
            self.mq = Plotter(next(ax), title='mq')
            self.vq = Plotter(next(ax), title='vq')
            self.td = Plotter(next(ax), title='td', auto_limit=12)
            self.pe = Plotter(next(ax), title='pred.err.', auto_limit=12)

        def _step(self, action):
            ob, r, done, info = super()._step(action)
            qs = self.meta.get('action_values', np.full(self.an, np.nan))
            qs -= np.mean(qs)
            # [qp.append(qs[i, ...]) for i, qp in enumerate(self.q_plotters)]

            self.q.append(qs)
            self.r.append(r)
            self.mq.append(self.meta.get('mq', 0))
            self.vq.append(self.meta.get('vq', 0))
            self.td.append(self.meta.get('td', 0))
            self.pe.append(self.meta.get('smse', 0))

            return ob, r, done, info

        def _render(self, mode='human', close=False):
            f = super()._render(mode, close)
            # fs = [qp.draw() for qp in self.q_plotters]
            f2 = draw(self.f)
            return chi.rl.util.concat_frames(f, self.obs, f2)

        def _observation(self, observation):
            self.obs = np.tile(observation[:, :, -1:], (1, 1, 3))
            return observation

    class NoiseWrapper(chi.rl.Wrapper):
        def _reset(self):
            self.mask = np.asarray(np.random.normal(
                0, 20, size=self.observation_space.shape),
                                   dtype=np.uint8)

            return super()._reset()

        def _observation(self, observation):
            np.clip(observation + self.mask, 0, 255, observation)
            return observation

    env_name = env  # needed: make_env below assigns to a local 'env', so the outer name is saved first

    def make_env(i):
        env = env_name + 'NoFrameskip-v3'
        env = gym.make(env)
        env = chi.rl.wrappers.AtariWrapper(env)

        if i == 1:
            env = NoiseWrapper(env)

        env = chi.rl.wrappers.StackFrames(env, 4)
        env = wrappers.SkipWrapper(4)(env)

        if i in (0, 1):
            env = RenderMeta(env)

        env = wrappers.Monitor(env,
                               self.logdir + '/monitor_' + str(i),
                               video_callable=lambda j: j % (20 if i in (0, 1) else 200) == 0)

        return env

    envs = [make_env(i) for i in range(agents)]
    env = envs[0]
    print_env(env)

    @chi.model(tracker=tf.train.ExponentialMovingAverage(1 - .0005),
               optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
    def pp(x):
        x /= 255
        x = layers.conv2d(x, 32, 8, 4)
        x = layers.conv2d(x, 64, 4, 2)
        x = layers.conv2d(x, 64, 3, 1)
        x = layers.flatten(x)
        return x

    @chi.model(tracker=tf.train.ExponentialMovingAverage(1 - .0005),
               optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
    def heads(x):
        qs = []
        for _ in range(n_heads):
            xv = layers.fully_connected(x, 512)
            val = layers.fully_connected(xv, 1, activation_fn=None)
            # val = tf.squeeze(val, 1)

            xa = layers.fully_connected(x, 512)
            adv = layers.fully_connected(xa,
                                         env.action_space.n,
                                         activation_fn=None)

            q = val + adv - tf.reduce_mean(adv, axis=1, keep_dims=True)
            q = tf.identity(q, name='Q')
            qs.append(q)

        return qs

    dqn = BootstrappedDQN(env.action_space.n,
                          env.observation_space.shape,
                          pp,
                          heads,
                          replay_start=replay_start,
                          logdir=logdir)

    for i, env in enumerate(envs):
        agent = dqn.make_agent(test=i in (0, 1),
                               train=i != 1,
                               memory_size=memory_size // (agents - 1),
                               logdir=logdir,
                               name=f'Agent_{i}')
        agent.run(env, async=True)

    dqn.train(timesteps, tter)
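
The heads model above returns one Q output per head; bootstrapped DQN explores by sampling a single head at the start of each episode and acting greedily with it. A minimal, hypothetical sketch of that selection logic (illustration only, not chi's BootstrappedDQN internals):

import numpy as np

def pick_head(n_heads):
    # sample one Q-head at the start of each episode
    return np.random.randint(n_heads)

def greedy_action(q_values_per_head, head):
    # q_values_per_head: list of length n_heads, each an array of shape [n_actions]
    return int(np.argmax(q_values_per_head[head]))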
Example #8
def bdpg_chains2(self: Experiment, logdir=None, env=1, heads=3, n=50, bootstrap=False, sr=50000):
  from tensorflow.contrib import layers
  import gym
  from gym import spaces
  from gym import wrappers
  import numpy as np
  from tensorflow.contrib.framework import arg_scope

  def gym_make(id) -> gym.Env:
    return gym.make(id)

  chi.set_loglevel('debug')

  if env == 0:
    import gym_mix
    from chi.rl.wrappers import PenalizeAction
    env = gym_mix.envs.ChainEnv(n)
    env = PenalizeAction(env, .001, 1)
  elif env == 1:
    # env = gym.make('Pendulum-v0')
    env = gym.make('MountainCarContinuous-v0')

  if bootstrap:
    class Noise(Wrapper):
      def __init__(self, env):
        super().__init__(env)
        self.n = 3
        self.observation_space = gym.spaces.Box(
          np.concatenate((self.observation_space.low, np.full([self.n], -1))),
          np.concatenate((self.observation_space.high, np.full([self.n], 1))))

      def _reset(self):
        s = super()._reset()
        self.noise = np.random.uniform(-1, 1, [self.n])
        s = np.concatenate([s, self.noise])
        return s

      def _step(self, action):
        s, r, d, i = super()._step(action)
        s = np.concatenate([s, self.noise])
        return s, r, d, i

    env = Noise(env)

  print_env(env)

  def pp(x):
    # v = get_local_variable('noise', [x.shape[0], 100], initializer=tf.random_normal_initializer)
    # y = tf.concat(x, v)
    return x

  def ac(x):
    with tf.name_scope('actor_head'):
      x = layers.fully_connected(x, 50, biases_initializer=layers.xavier_initializer())
      x = layers.fully_connected(x, 50, biases_initializer=layers.xavier_initializer())
      # a = layers.fully_connected(x, env.action_space.shape[0], None, weights_initializer=tf.random_normal_initializer(0, 1e-4))
      a = layers.fully_connected(x, env.action_space.shape[0], None)
      return a

  def cr(x, a):
    with tf.name_scope('critic_head'):
      x = layers.fully_connected(x, 50, biases_initializer=layers.xavier_initializer())
      x = tf.concat([x, a], axis=1)
      x = layers.fully_connected(x, 50, biases_initializer=layers.xavier_initializer())
      # q = layers.fully_connected(x, 1, None, weights_initializer=tf.random_normal_initializer(0, 1e-4))
      q = layers.fully_connected(x, 1, None)
      return tf.squeeze(q, 1)

  if bootstrap:
    agent = DdpgAgent(env, ac, cr, replay_start=sr, noise=lambda a: a)
  else:
    agent = DdpgAgent(env, ac, cr, replay_start=sr)
  threshold = getattr(getattr(env, 'spec', None), 'reward_threshold', None)

  for ep in range(100000):

    R, info = agent.play_episode()

    if ep % 20 == 0:
      head = info.get('head')
      print(f'Return of episode {ep} after timestep {agent.t}: {R} (head = {head}, threshold = {threshold})')

    if ep % 100 == 0 and bootstrap:
      pass
Example #9
def bdpg_chains(self: Experiment,
                logdir=None,
                env=3,
                heads=3,
                n=10,
                bootstrap=True):
    from tensorflow.contrib import layers
    import gym
    from gym import spaces
    from gym import wrappers
    import numpy as np
    from tensorflow.contrib.framework import arg_scope

    def gym_make(id) -> gym.Env:
        return gym.make(id)

    chi.set_loglevel('debug')

    import gym_mix
    from chi.rl.wrappers import PenalizeAction
    env = gym_mix.envs.ChainEnv(n)
    env = PenalizeAction(env, .001, 1)

    print_env(env)

    def ac(x):
        with tf.name_scope('actor_head'):
            x = layers.fully_connected(
                x, 50, biases_initializer=layers.xavier_initializer())
            x = layers.fully_connected(
                x, 50, biases_initializer=layers.xavier_initializer())
            # a = layers.fully_connected(x, env.action_space.shape[0], None, weights_initializer=tf.random_normal_initializer(0, 1e-4))
            a = layers.fully_connected(x, env.action_space.shape[0], None)
            return a

    def cr(x, a):
        with tf.name_scope('critic_head'):
            x = layers.fully_connected(
                x, 50, biases_initializer=layers.xavier_initializer())
            x = tf.concat([x, a], axis=1)
            x = layers.fully_connected(
                x, 50, biases_initializer=layers.xavier_initializer())
            # q = layers.fully_connected(x, 1, None, weights_initializer=tf.random_normal_initializer(0, 1e-4))
            q = layers.fully_connected(x, 1, None)
            return tf.squeeze(q, 1)

    if bootstrap:
        agent = BdpgAgent(env, ac, cr, heads=heads, replay_start=5000)
    else:
        agent = DdpgAgent(env, ac, cr, replay_start=5000)
    threshold = getattr(getattr(env, 'spec', None), 'reward_threshold', None)

    for ep in range(100000):

        R, info = agent.play_episode()

        if ep % 20 == 0:
            head = info.get('head')
            print(
                f'Return of episode {ep} after timestep {agent.t}: {R} (head = {head}, threshold = {threshold})'
            )
Example #10
File: main.py Project: rmst/chi
def chiboard(self: chi.Experiment, host='localhost', port=MAGIC_PORT, rootdir='',
             loglevel='debug', timeout=24*60*60, port_pool=""):
  from flask import Flask, jsonify, send_from_directory, send_file
  from chi.board.server import Server
  from chi.board.util import rcollect
  from chi.board.util import get_free_port
  from chi.logger import logger

  import os
  import json  # used below to read ~/.chi/board/remotes.json
  import signal
  from time import time, sleep
  from threading import Thread
  from os.path import expanduser as expandu
  from flask_socketio import SocketIO

  def expanduser(p):
    pa = expandu(p)
    return pa if pa.startswith('/') else '/' + pa

  chi.set_loglevel(loglevel)

  if port == 0:
    port = get_free_port(host)
    print(f'{port}')

  self.config.port = port

  p = os.path.dirname(os.path.realpath(__file__))
  app = Flask(__name__, root_path=p, static_url_path='/')

  socketio = SocketIO(app)

  if rootdir == '':
    rootdir = os.environ.get('CHI_EXPERIMENTS') or '~'
    logger.debug('Rootdir: ' + rootdir)

  if port_pool:
    port_pool = [int(p) for p in port_pool.split(',')]
  else:
    port_pool = range(port + 1, port + 30)

  server = Server(host, port, rootdir, port_pool)

  remotes = []
  p = expanduser('~/.chi/board/remotes.json')
  if os.path.exists(p):
    with open(p) as f:
      remotes = json.load(f)
      # print(remotes)

  state = dict(last_request=time())

  def killer():
    while time() - state['last_request'] < timeout:
      sleep(2)
    logger.error('timeout')
    os.kill(os.getpid(), signal.SIGINT)  # kill self

  Thread(target=killer, daemon=True).start()

  @app.before_request
  def tick():
    state.update(last_request=time())

  @app.route("/")
  def index():
    return send_file("components/index.html")

  @app.route("/favicon")
  def favicon():
    return send_file("components/favicon.png")

  @app.route('/bower_components/<path:path>')
  def bower(path):
    return send_from_directory('bower_components', path)

  @app.route('/components/<path:path>')
  def comp(path):
    return send_from_directory('components', path)

  @app.route("/exp/")
  def exp():
    return send_file("components/experiment.html")

  @app.route("/info/<string:host>/<path:path>")  # experiment page
  def info(host, path):
    if host == 'local':
      return jsonify(server.info(expanduser(path)))
    else:
      raise Exception('Remote not yet supported')
      # request scripts info
      # update urls

  @app.route("/logs/<path:path>")
  def logs(path):
    data = []

    def key(x):
      k = '_' if x == 'stdout' else x
      return k

    path = expanduser(path) + '/logs'

    for p in sorted(os.listdir(path), key=key):
      with open(path + '/' + p, 'r') as f:
        f.seek(0, os.SEEK_END)
        l = f.tell()
        f.seek(max((0, l - 50000)), 0)
        c = f.read()
        while c and c[-1] == '\n':
          c = c[:-1]
        # c = c.replace('\n', '<br>')
        # c = c.replace('<', '&lt;')
        data.append({'name': os.path.basename(p), 'content': c})

    return jsonify(data)

  @app.route("/tb/<string:host>/<path:path>")
  def tb(host, path):
    if host == 'local':
      return jsonify(server.tensorboard(expanduser(path)))
    else:
      raise Exception('Remote not yet supported')
      # make local port forward
      # request scripts tensorboard
      # update urls

  @app.route("/delete/<path:path>")
  def delete(path):
    return jsonify(server.delete(expanduser(path)))

  @app.route("/trend/<path:path>")
  def trend(path):
    sio = server.trend('/' + path)
    return send_file(sio, attachment_filename='trend.png', mimetype='image/png')

  @app.route("/<string:cmd>/<path:path>")
  def command(cmd, path):
    return jsonify(server.command(cmd, expanduser(path)))

  try:
    socketio.on_namespace(server)
    socketio.run(app, host=host, port=port, log_output=loglevel == 'debug')
  finally:
    server.shutdown()