def default(task, config):
    # Configurable Atari-style preprocessing pipeline. Each step is only
    # added when the corresponding config value enables it.
    defaults = dict(
        subsample=2, frame_skip=4, history=4, delta=False, frame_max=2,
        noop_max=30)
    config = mp.utility.use_attrdicts(mp.utility.merge_dicts(defaults, config))
    policy = mp.Sequential(task)
    policy.add(mp.step.Image)
    if config.noop_max:
        policy.add(mp.step.RandomStart, config.noop_max)
    if config.frame_skip > 1:
        policy.add(mp.step.Skip, config.frame_skip)
    if config.frame_max:
        policy.add(mp.step.Maximum, config.frame_max)
    if config.history > 1:
        # Slice the grayscale weights to the number of input channels.
        channels = policy.above_task.observs.shape[-1]
        policy.add(mp.step.Grayscale, (0.299, 0.587, 0.114)[:channels])
    if config.subsample > 1:
        sub = config.subsample
        # Without grayscaling, keep the channel dimension unscaled.
        amount = (sub, sub) if config.history > 1 else (sub, sub, 1)
        policy.add(mp.step.Subsample, amount)
    if config.delta:
        policy.add(mp.step.Delta)
    if config.history > 1:
        policy.add(mp.step.History, config.history)
    policy.add(mp.step.Normalize)
    policy.add(mp.step.ClampReward)
    return policy
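# Hedged usage sketch for the builder above: `task` stands for any mindpark
# task object (its construction is outside this excerpt). An empty dict keeps
# every default; overriding delta=True inserts the Delta step between
# Subsample and History.
preprocess = default(task, dict(delta=True))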
def train_policies(self):
    heads = []
    for _ in range(self.config.heads):
        policy = mp.Sequential(self.task)
        policy.add(self._create_preprocess())
        policy.add(Head, self)
        heads.append(policy)
    return heads
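# Hedged sketch, not code from the source: train_policies() above returns one
# policy per bootstrap head. In bootstrapped DQN, a driver typically samples
# one head per training episode so the heads learn from different subsets of
# experience. `agent` is a hypothetical instance of the owning class.
import random
episode_policy = random.choice(agent.train_policies())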
def dqn_2015(task, config=None):
    # Fixed pipeline using the hyperparameters of the 2015 DQN setup.
    policy = mp.Sequential(task)
    policy.add(mp.step.Image)
    policy.add(mp.step.RandomStart, 30)
    policy.add(mp.step.Skip, 4)
    policy.add(mp.step.Maximum, 2)
    policy.add(mp.step.Grayscale, (0.299, 0.587, 0.114))
    policy.add(mp.step.Subsample, (2, 2))
    policy.add(mp.step.History, 4)
    policy.add(mp.step.Normalize)
    policy.add(mp.step.ClampReward)
    return policy
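# For comparison, this fixed pipeline matches the configurable `default`
# builder called with its stock values, except that Grayscale is applied
# unconditionally here rather than being gated on history > 1:
#   dqn_2015(task)  ~  default(task, dict())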
def train_policies(self):
    trainers = []
    for _ in range(self.config.learners):
        config = mp.utility.AttrDict(self.config.copy())
        # TODO: Use single model to share RMSProp statistics. Does RMSProp
        # use statistics in compute_gradients() or apply_gradients()?
        # Each learner builds its own single-threaded graph and starts from
        # the shared model's weights.
        model = mp.model.Model(self._create_network, threads=1)
        model.weights = self.model.weights
        policy = mp.Sequential(self.task)
        policy.add(self._create_preprocess())
        policy.add(Train, config, self, model)
        trainers.append(policy)
    return trainers
def policy(self):
    # TODO: Why doesn't self.task work here?
    policy = mp.Sequential(self._preprocess.task)
    policy.add(self._preprocess)
    policy.add(self)
    return policy
def _create_preprocess(self):
    policy = mp.Sequential(self.task)
    preprocess = getattr(mp.part.preprocess, self.config.preprocess)
    policy.add(preprocess, self.config.preprocess_config)
    # Epsilon-greedy exploration is appended after preprocessing.
    policy.add(mp.step.EpsilonGreedy, **self.config.epsilon)
    return policy
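# Hedged sketch of a config this builder could consume. The preprocess name
# must match a callable in mp.part.preprocess, such as `default` or
# `dqn_2015` above. The epsilon keyword names below are assumptions about
# the EpsilonGreedy step's signature, not confirmed by this excerpt.
config = mp.utility.AttrDict(
    preprocess='dqn_2015',
    preprocess_config=dict(),
    epsilon=dict(start=1.0, stop=0.1),  # hypothetical keyword names
)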
def _create_preprocess(self):
    policy = mp.Sequential(self.task)
    preprocess = getattr(mp.part.preprocess, self.config.preprocess)
    policy.add(preprocess, self.config.preprocess_config)
    return policy
def test_policy(self):
    policy = mp.Sequential(self.task)
    policy.add(self._preprocess)
    policy.add(Test, self.model, self)
    return policy
def _prepend_score_step(self, policy):
    combined = mp.Sequential(policy.task)
    combined.add(mp.step.Score)
    combined.add(policy)
    return combined
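# Hedged usage sketch (inside the owning class): wrap the evaluation policy
# so the Score step records episode scores before the wrapped policy acts.
#   policy = self._prepend_score_step(self.test_policy())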