def __init__(self, config, datadir, actspace, writer):
  self._c = config
  self._actspace = actspace
  # Discrete action spaces expose `n`; continuous ones expose a shape.
  self._actdim = actspace.n if hasattr(actspace, 'n') else actspace.shape[0]
  self._writer = writer
  self._random = np.random.RandomState(config.seed)
  with tf.device('cpu:0'):
    self._step = tf.Variable(count_steps(datadir, config), dtype=tf.int64)
  self._should_pretrain = tools.Once()
  self._should_train = tools.Every(config.train_every)
  self._should_log = tools.Every(config.log_every)
  self._last_log = None
  self._last_time = time.time()
  self._metrics = collections.defaultdict(tf.metrics.Mean)
  self._metrics['expl_amount']  # Create variable for checkpoint.
  self._float = prec.global_policy().compute_dtype
  self._strategy = tf.distribute.MirroredStrategy()
  with self._strategy.scope():
    self._dataset = iter(self._strategy.experimental_distribute_dataset(
        load_dataset(datadir, self._c)))
    self._build_model()
  # Echo the key hyperparameters at startup.
  print(f'model_lr:{self._c.model_lr}')
  print(f'actor_lr:{self._c.actor_lr}')
  print(f'value_lr:{self._c.value_lr}')
  print(f'grad_clip:{self._c.grad_clip}')
  print(f'batch_size:{self._c.batch_size}')
  print(f'deter_size:{self._c.deter_size}')
  print(f'stoch_size:{self._c.stoch_size}')
  print(f'kl_scale:{self._c.kl_scale}')
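
The constructor gates pretraining, training, and logging through small helpers from the tools module whose definitions are not shown here. A minimal sketch consistent with how they are called above (Once takes no arguments; Every receives the current step) could look like the following; the exact bookkeeping in the real module may differ.

class Once:
  # True on the first call, False on every call after that.
  def __init__(self):
    self._once = True

  def __call__(self):
    if self._once:
      self._once = False
      return True
    return False


class Every:
  # True whenever `step` has advanced by at least `every` steps since the
  # last time this returned True; a falsy `every` disables the trigger.
  def __init__(self, every):
    self._every = every
    self._last = None

  def __call__(self, step):
    if not self._every:
      return False
    if self._last is None:
      self._last = step
      return True
    if step >= self._last + self._every:
      self._last += self._every
      return True
    return False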
def __init__(self, config, datadir, actspace, writer):
  self._c = config
  self._actspace = actspace
  self._actdim = actspace.n if hasattr(actspace, 'n') else actspace.shape[0]
  self._writer = writer
  self._random = np.random.RandomState(config.seed)
  with tf.device('cpu:0'):
    self._step = tf.Variable(count_steps(datadir, config), dtype=tf.int64)
  self._should_pretrain = tools.Once()
  self._should_train = tools.Every(config.train_every)
  self._should_log = tools.Every(config.log_every)
  self._last_log = None
  self._last_time = time.time()
  self._metrics = collections.defaultdict(tf.metrics.Mean)
  self._metrics['expl_amount']  # Create variable for checkpoint.
  self._float = prec.global_policy().compute_dtype
  # Pin the mirrored strategy to a single GPU.
  self._strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0'])
  print('device_num:', self._strategy.num_replicas_in_sync)
  with self._strategy.scope():
    self._dataset = iter(self._strategy.experimental_distribute_dataset(
        load_dataset(datadir, self._c)))
    self._build_model()
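
Both of these constructors resume the step counter from the data already on disk via count_steps, which is also not shown. A plausible sketch, assuming the common convention in this codebase that each episode file ends in its length (e.g. episode-500.npz) and that environment steps are scaled by the action repeat, is:

import pathlib

def count_steps(datadir, config):
  # Sum the per-episode lengths encoded in the .npz filenames and scale by
  # the action repeat to recover the number of environment steps.
  lengths = [
      int(path.stem.rsplit('-', 1)[-1])
      for path in pathlib.Path(datadir).glob('*.npz')]
  return sum(lengths) * config.action_repeat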
def __init__(self, config, logger, dataset):
  self._config = config
  self._logger = logger
  self._float = prec.global_policy().compute_dtype
  self._should_log = tools.Every(config.log_every)
  self._should_train = tools.Every(config.train_every)
  self._should_pretrain = tools.Once()
  self._should_reset = tools.Every(config.reset_every)
  self._should_expl = tools.Until(
      int(config.expl_until / config.action_repeat))
  self._metrics = collections.defaultdict(tf.metrics.Mean)
  with tf.device('cpu:0'):
    self._step = tf.Variable(count_steps(config.traindir), dtype=tf.int64)
  # Schedules. Default arguments bind the original config strings so the
  # entries can be replaced by step-dependent callables.
  config.actor_entropy = (
      lambda x=config.actor_entropy: tools.schedule(x, self._step))
  config.actor_state_entropy = (
      lambda x=config.actor_state_entropy: tools.schedule(x, self._step))
  config.imag_gradient_mix = (
      lambda x=config.imag_gradient_mix: tools.schedule(x, self._step))
  self._dataset = iter(dataset)
  self._wm = models.WorldModel(self._step, config)
  self._task_behavior = models.ImagBehavior(
      config, self._wm, config.behavior_stop_grad)
  reward = lambda f, s, a: self._wm.heads['reward'](f).mode()
  self._expl_behavior = dict(
      greedy=lambda: self._task_behavior,
      random=lambda: expl.Random(config),
      plan2explore=lambda: expl.Plan2Explore(config, self._wm, reward),
  )[config.expl_behavior]()
  # Train step to initialize variables including optimizer statistics.
  self._train(next(self._dataset))
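
This variant additionally wraps several config entries in schedule lambdas and gates exploration with tools.Until. Neither helper is defined above; a sketch consistent with this usage, assuming schedule specs are either plain numbers ('1e-4') or linear anneals ('linear(initial,final,duration)'), might be:

import re

class Until:
  # True while `step` is below the cutoff; a falsy cutoff means forever.
  def __init__(self, until):
    self._until = until

  def __call__(self, step):
    if not self._until:
      return True
    return step < self._until


def schedule(string, step):
  # Constant schedules are plain numbers, e.g. '1e-4'.
  try:
    return float(string)
  except ValueError:
    pass
  # Linear schedules interpolate from `initial` to `final` over `duration`
  # steps, e.g. 'linear(3e-3,3e-4,2.5e6)'.
  match = re.match(r'linear\((.+),(.+),(.+)\)', string)
  if match:
    initial, final, duration = (float(g) for g in match.groups())
    mix = min(max(float(step) / duration, 0.0), 1.0)
    return (1 - mix) * initial + mix * final
  raise NotImplementedError(string)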