def make_network(env, h=None, w=None):
    with env.create_network() as net:
        if h is None:
            img = O.placeholder('img', shape=(1, None, None, 3))
        else:
            img = O.variable('img', np.zeros([1, h, w, 3]))
        net.add_output(img, name='img')

        _ = img
        _ = _ - get_env('neural_style.image_mean').reshape(1, 1, 1, 3)
        _ = O.pad_rb_multiple_of(_, 32)

        def stacked_conv(prefix, nr_convs, in_, channel, kernel=(3, 3), padding='SAME', nonlin=O.relu):
            for i in range(1, nr_convs + 1):
                in_ = O.conv2d('{}_{}'.format(prefix, i), in_, channel, kernel, padding=padding, nonlin=nonlin)
            return in_

        _ = stacked_conv('conv1', 2, _, 64)
        _ = O.pooling2d('pool1', _, (2, 2))
        _ = stacked_conv('conv2', 2, _, 128)
        _ = O.pooling2d('pool2', _, (2, 2))
        _ = stacked_conv('conv3', 3, _, 256)
        _ = O.pooling2d('pool3', _, (2, 2))
        _ = stacked_conv('conv4', 3, _, 512)
        _ = O.pooling2d('pool4', _, (2, 2))
        _ = stacked_conv('conv5', 3, _, 512)
        _ = O.pooling2d('pool5', _, (2, 2))

        for l in get_env('neural_style.content_layers'):
            net.add_output(net.find_var_by_name(l[0] + '/bias'), name=l[0])
        for l in get_env('neural_style.style_layers'):
            net.add_output(net.find_var_by_name(l[0] + '/bias'), name=l[0])
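# --- Hedged usage sketch (not part of the original file) ---
# A minimal numpy illustration of how the content/style outputs registered above are
# typically consumed in neural style transfer: each style layer contributes a Gram-matrix
# loss, each content layer an L2 activation loss. The names `gen_feat`, `style_feat` and
# `content_feat` are hypothetical activation maps of shape (1, H, W, C).
import numpy as np

def gram_matrix(feat):
    # Flatten spatial dims: (1, H, W, C) -> (H*W, C), then form the C x C Gram matrix.
    c = feat.shape[-1]
    flat = feat.reshape(-1, c)
    return flat.T @ flat / flat.shape[0]

def style_loss(gen_feat, style_feat):
    # Squared Frobenius distance between Gram matrices.
    return np.mean((gram_matrix(gen_feat) - gram_matrix(style_feat)) ** 2)

def content_loss(gen_feat, content_feat):
    # Plain L2 distance between activation maps.
    return 0.5 * np.mean((gen_feat - content_feat) ** 2)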
def make_dataflow_train(env):
    rng = random.gen_rng()

    def _outputs2action(outputs):
        epsilon = env.runtime['exp_epsilon']
        return outputs['q_argmax'] if rng.rand() > epsilon else rng.choice(get_player_nr_actions())

    collector = rl.train.SynchronizedExperienceCollector(
        env, make_player, _outputs2action,
        nr_workers=get_env('dqn.collector.nr_workers'),
        nr_predictors=get_env('dqn.collector.nr_workers'),
        predictor_output_names=get_env('dqn.collector.predictor_output_names'),
        mode=get_env('dqn.collector.mode'))

    return rl.train.QLearningDataFlow(
        collector,
        target=get_env('dqn.collector.target'),
        maxsize=get_env('dqn.expreplay.maxsize'),
        batch_size=get_env('trainer.batch_size'),
        epoch_size=get_env('trainer.epoch_size'),
        gamma=get_env('dqn.gamma'),
        nr_td_steps=get_env('dqn.nr_td_steps'),
        reward_cb=lambda r: np.clip(r, -1, 1))
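# --- Hedged illustration (not part of the original file) ---
# QLearningDataFlow presumably assembles n-step Q-learning targets from the replayed
# transitions, with the same reward clipping as reward_cb above. The standard form is
#   y = sum_{k=0}^{n-1} gamma^k * clip(r_k, -1, 1) + gamma^n * max_a Q_target(s_n, a).
# A self-contained numpy sketch of that target (names here are hypothetical):
import numpy as np

def nstep_q_target(rewards, next_q_values, gamma, is_over=False):
    # `rewards`: the n intermediate rewards; `next_q_values`: Q_target(s_n, .) as an array.
    y = 0.0 if is_over else gamma ** len(rewards) * np.max(next_q_values)
    for k, r in enumerate(rewards):
        y += gamma ** k * np.clip(r, -1, 1)
    return y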
def make_step(net):
    """Run a single gradient-ascent step on the input image, using the 'end' output."""
    imgvar = net.outputs['img']
    target = net.outputs['end']
    netin = imgvar

    # Randomly draw the jitter offsets ox, oy.
    jitter = get_env('deep_dream.jitter')
    ox, oy = np.random.randint(-jitter, jitter + 1, 2)
    img = netin.get_value()
    img = np.roll(np.roll(img, ox, 2), oy, 1)  # apply jitter shift

    # Compute the gradient. Note that we actually use the L2 loss of an activation
    # map to compute the gradient w.r.t. the input.
    netin.set_value(img)
    loss = 0.5 * (target**2.).mean()
    grad = O.grad(loss, imgvar)
    grad = grad.eval()

    # Apply gradient ascent with a normalized gradient.
    img += get_env('deep_dream.learning_rate') / np.abs(grad).mean() * grad
    img = np.clip(img, 0, 255)
    img = np.roll(np.roll(img, -ox, 2), -oy, 1)  # unshift image
    netin.set_value(img)
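# --- Hedged usage sketch (not part of the original file) ---
# make_step() performs one jittered gradient-ascent update on the input image; a driver
# would simply call it repeatedly and periodically dump the result. The config key
# 'deep_dream.nr_iters' and the `save_image` callable are hypothetical.
import numpy as np

def run_deep_dream(net, save_image=None):
    for it in range(get_env('deep_dream.nr_iters', 20)):
        make_step(net)
        if save_image is not None and (it + 1) % 5 == 0:
            img = net.outputs['img'].get_value()[0]
            save_image('deep_dream_{:03d}.png'.format(it + 1), np.clip(img, 0, 255).astype('uint8'))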
def make_optimizer(env):
    wrapper = optimizer.OptimizerWrapper()
    lr = optimizer.make_optimizer_variable(
        'learning_rate', get_env('trainer.policy_learning_rate'), prefix='policy_')
    wrapper.set_base_optimizer(optimizer.base.AdamOptimizer(lr, epsilon=1e-3))
    wrapper.append_grad_modifier(optimizer.grad_modifier.LearningRateMultiplier([
        ('*/b', 2.0),
    ]))
    env.set_policy_optimizer(wrapper)

    use_linear_vr = get_env('ppo.use_linear_vr')
    if not use_linear_vr:
        wrapper = optimizer.OptimizerWrapper()
        lr = optimizer.make_optimizer_variable(
            'learning_rate', get_env('trainer.value_learning_rate'), prefix='value_')
        wrapper.set_base_optimizer(optimizer.base.AdamOptimizer(lr, epsilon=1e-3))
        wrapper.append_grad_modifier(optimizer.grad_modifier.LearningRateMultiplier([
            ('*/b', 2.0),
        ]))
        env.set_value_optimizer(wrapper)
def main_demo(env, func):
    df = iter(make_dataflow_demo(env))
    nr_samples = get_env('demo.nr_samples', 40 * 8)
    grid_desc = get_env('demo.grid_desc', ('20v', '16h'))

    while True:
        all_imgs_ab = []
        all_imgs_ba = []
        for i in range(nr_samples):
            feed_dict = next(df)
            results = func(**feed_dict)
            img_a, img_b = feed_dict['img_a'][0], feed_dict['img_b'][0]
            img_ab, img_ba = results['img_ab'][0] * 255, results['img_ba'][0] * 255
            img_aba, img_bab = results['img_aba'][0] * 255, results['img_bab'][0] * 255
            all_imgs_ab.append(np.hstack([img_a, img_ab]).astype('uint8'))
            all_imgs_ba.append(np.hstack([img_b, img_ba]).astype('uint8'))

        all_imgs_ab = image.image_grid(all_imgs_ab, grid_desc)
        all_imgs_ba = image.image_grid(all_imgs_ba, grid_desc)
        sep = np.ones((all_imgs_ab.shape[0], 64, 3), dtype='uint8') * 255
        all_imgs = np.hstack([all_imgs_ab, sep, all_imgs_ba])
        image.imwrite('discogan.png', all_imgs)
        image.imshow('AtoB; BtoA', all_imgs)
def make_player(is_train=True, dump_dir=None):
    p = rl.GymRLEnviron(get_env('a3c.env_name'), dump_dir=dump_dir)
    p = rl.HistoryFrameProxyRLEnviron(p, get_env('a3c.nr_history_frames'))
    p = rl.LimitLengthProxyRLEnviron(p, get_env('a3c.max_nr_steps'))
    if is_train:
        p = rl.AutoRestartProxyRLEnviron(p)
    return p
def main_train(trainer):
    from tartist.app.rl.utils.adv import GAEComputer
    from tartist.random.sampler import SimpleBatchSampler
    trainer.set_adv_computer(GAEComputer(get_env('ppo.gamma'), get_env('ppo.gae.lambda')))
    trainer.set_batch_sampler(SimpleBatchSampler(get_env('trainer.batch_size'), get_env('trainer.data_repeat')))

    # Register plugins.
    from tartist.plugins.trainer_enhancer import summary
    summary.enable_summary_history(trainer, extra_summary_types={
        'inference/score': 'async_scalar',
    })
    summary.enable_echo_summary_scalar(trainer, summary_spec={'inference/score': ['avg', 'max']})

    from tartist.plugins.trainer_enhancer import progress
    progress.enable_epoch_progress(trainer)

    from tartist.plugins.trainer_enhancer import snapshot
    snapshot.enable_snapshot_saver(trainer, save_interval=1)

    def on_epoch_after(trainer):
        if trainer.epoch > 0 and trainer.epoch % 2 == 0:
            main_inference_play_multithread(trainer)

    # This one should run before the monitor.
    trainer.register_event('epoch:after', on_epoch_after, priority=5)

    trainer.train()
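# --- Hedged illustration (not part of the original file) ---
# GAEComputer presumably fills in advantages via Generalized Advantage Estimation:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lambda * A_{t+1}
# A self-contained numpy version of that recursion (the function name is hypothetical):
import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    # `values` has one extra entry: the bootstrap value V(s_T) for the final state.
    adv = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lam * running
        adv[t] = running
    return adv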
def parse_history(history):
    num = len(history)
    if is_over:
        r = 0
        env.players_history[identifier] = []
    elif num == get_env('a3c.nr_td_steps') + 1:
        history, last = history[:-1], history[-1]
        r = last.value
        env.players_history[identifier] = [last]
    else:
        return

    gamma = get_env('a3c.gamma')
    for i in history[::-1]:
        r = np.clip(i.reward, -1, 1) + gamma * r
        try:
            # MJY(20170910):: No wait!!! We need post_state.
            if env.rpredictor.waiting_for_data.is_set():
                data_queue.put_nowait({
                    'state': i.state,
                    'action': i.action,
                    'future_reward': r
                })
            else:
                # Still set a timeout.
                data_queue.put({
                    'state': i.state,
                    'action': i.action,
                    'future_reward': r
                }, timeout=1)
        except queue.Full:
            pass
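# --- Hedged illustration (not part of the original file) ---
# The backward loop above computes clipped n-step discounted returns,
#   r_t = clip(reward_t, -1, 1) + gamma * r_{t+1},
# bootstrapped from 0 at episode end or from the predicted value of the last (held-back)
# transition otherwise. A self-contained numpy version of that accumulation:
import numpy as np

def nstep_returns(rewards, bootstrap_value, gamma):
    r = bootstrap_value
    out = []
    for reward in reversed(rewards):
        r = np.clip(reward, -1, 1) + gamma * r
        out.append(r)
    return list(reversed(out))  # future_reward for each transition, oldest first

# Example: rewards over 3 steps, value estimate 0.5 at the cut-off, gamma = 0.99:
# nstep_returns([0.0, 2.0, -3.0], 0.5, 0.99) -> approximately [0.495, 0.500, -0.505]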
def make_dataflow_train(env):
    batch_size = get_env('trainer.batch_size')
    dfs = [_make_dataflow(batch_size, use_prefetch=True) for i in range(2)]
    df = gan.GANDataFlow(dfs[0], dfs[1],
                         get_env('trainer.nr_g_per_iter', 1),
                         get_env('trainer.nr_d_per_iter', 1))
    return df
def make_dataflow_train(env):
    ensure_load()
    batch_size = get_env('trainer.batch_size')
    df = _mnist[0]
    df = flow.DOARandomSampleDataFlow(df)
    df = flow.BatchDataFlow(df, batch_size, sample_dict={
        'img': np.empty(shape=(batch_size, 28, 28, 1), dtype='float32'),
    })
    df = gan.GANDataFlow(None, df,
                         get_env('trainer.nr_g_per_iter', 1),
                         get_env('trainer.nr_d_per_iter', 1))
    return df
def main_demo(env, func):
    func.compile(env.network.outputs['q_argmax'])

    dump_dir = get_env('dir.demo', os.path.join(get_env('dir.root'), 'demo'))
    logger.info('Demo dump dir: {}'.format(dump_dir))
    player = make_player(dump_dir=dump_dir)
    repeat_time = get_env('dqn.demo.nr_plays', 1)

    for i in range(repeat_time):
        player.play_one_episode(func=lambda state: func(state=state[np.newaxis])[0])
        logger.info('#{} play score={}'.format(i, player.stats['score'][-1]))
def main_demo(env, func):
    dump_dir = get_env('dir.demo', os.path.join(get_env('dir.root'), 'demo'))
    logger.info('Demo dump dir: {}'.format(dump_dir))
    player = make_player(dump_dir=dump_dir)
    repeat_time = get_env('cem.demo.nr_plays', 1)

    def get_action(inp, func=func):
        policy = func(state=inp[np.newaxis])['policy'][0]
        return _policy2action(policy)

    for i in range(repeat_time):
        player.play_one_episode(get_action)
        logger.info('#{} play score={}'.format(i, player.stats['score'][-1]))
def make_dataflow_inference(env):
    ensure_load()
    batch_size = get_env('inference.batch_size')
    epoch_size = get_env('inference.epoch_size')

    df = _mnist[1]  # actually use the validation set
    df = flow.DictOfArrayDataFlow(df)
    df = flow.tools.cycle(df)
    df = flow.BatchDataFlow(df, batch_size, sample_dict={
        'img': np.empty(shape=(batch_size, 28, 28, 1), dtype='float32'),
    })
    df = flow.EpochDataFlow(df, epoch_size)
    return df
def main_demo(env, func):
    dump_dir = get_env('dir.demo', os.path.join(get_env('dir.root'), 'demo'))
    logger.info('Demo dump dir: {}'.format(dump_dir))
    player = make_player(is_train=False, dump_dir=dump_dir)
    repeat_time = get_env('a3c.demo.nr_plays', 1)

    def get_action(inp, func=func):
        action = func(**{'state': [[inp]]})['policy'][0].argmax()
        return action

    for i in range(repeat_time):
        player.play_one_episode(get_action)
        logger.info('#{} play score={}'.format(i, player.stats['score'][-1]))
def make_player(is_train=True, dump_dir=None):
    def resize_state(s):
        return image.resize(s, get_env('a3c.input_shape'), interpolation='NEAREST')

    p = rl.GymRLEnviron(get_env('a3c.env_name'), dump_dir=dump_dir)
    p = rl.MapStateProxyRLEnviron(p, resize_state)
    p = rl.HistoryFrameProxyRLEnviron(p, get_env('a3c.nr_history_frames'))
    p = rl.LimitLengthProxyRLEnviron(p, get_env('a3c.max_nr_steps'))
    if is_train:
        p = rl.AutoRestartProxyRLEnviron(p)
    else:
        p = rl.GymPreventStuckProxyRLEnviron(p, get_env('a3c.inference.max_antistuck_repeat'), 1)
    return p
def make_dataflow_train(env):
    def _outputs2action(outputs):
        return outputs['policy']

    collector = rl.train.SynchronizedExperienceCollector(
        env, make_player, _outputs2action,
        nr_workers=get_env('ppo.collector.nr_workers'),
        nr_predictors=get_env('ppo.collector.nr_workers'),
        predictor_output_names=get_env('ppo.collector.predictor_output_names'),
        mode='EPISODE-STEP')

    return rl.train.SynchronizedTrajectoryDataFlow(
        collector, target=get_env('ppo.collector.target'), incl_value=True)
def initialize_all_peers(self):
    nr_players = get_env('a3c.nr_players')

    self._player_master.initialize()
    self.initialize_all_variables()
    self._player_master.start(nr_players, daemon=True)
    self._inference_player_master.initialize()
def initialize_a3c(self):
    nr_predictors = get_env('a3c.nr_predictors')

    # making net funcs
    self._net_funcs = []
    all_devices = self.slave_devices
    if len(all_devices) == 0:
        all_devices = self.all_devices
    for i in range(nr_predictors):
        dev = all_devices[i % len(all_devices)]
        func = self._make_predictor_net_func(i, dev)
        self._net_funcs.append(func)

    self._player_master = A3CMaster(self, 'a3c-player', nr_predictors)
    self._inference_player_master = A3CMaster(self, 'a3c-inference-player', nr_predictors)
    self._data_queue = queue.Queue(
        get_env('trainer.batch_size') * get_env('a3c.data_queue_length_factor', 16))
def _predictor_func(pid, router, task_queue, func, is_inference=False):
    batch_size = get_env('a3c.predictor.batch_size')
    batched_state = np.empty((batch_size, ) + get_input_shape(), dtype='float32')

    while True:
        callbacks = []
        nr_total = 0
        for i in range(batch_size):
            if i == 0 or not is_inference:
                identifier, inp, callback = task_queue.get()
            else:
                try:
                    identifier, inp, callback = task_queue.get_nowait()
                except queue.Empty:
                    break

            batched_state[i] = inp[0]
            callbacks.append(callback)
            nr_total += 1

        out = func(state=batched_state[:nr_total])
        for i in range(nr_total):
            if is_inference:
                action = out['policy'][i]
            else:
                action = sample_action(out['policy_explore'][i])
            callbacks[i](action, out['value'][i])
def make_optimizer(env):
    wrapper = optimizer.OptimizerWrapper()
    wrapper.set_base_optimizer(optimizer.base.MomentumOptimizer(get_env('trainer.learning_rate'), 0.9))
    wrapper.append_grad_modifier(optimizer.grad_modifier.LearningRateMultiplier([
        ('*/b', 2.0),
    ]))
    env.set_optimizer(wrapper)
def _predictor_func(pid, router, task_queue, func, is_inference=False):
    batch_size = get_env('a3c.predictor.batch_size')
    batched_state = np.empty((batch_size, ) + get_input_shape(), dtype='float32')

    while True:
        callbacks = []
        nr_total = 0
        for i in range(batch_size):
            if i == 0 or not is_inference:
                identifier, inp, callback = task_queue.get()
            else:
                try:
                    identifier, inp, callback = task_queue.get_nowait()
                except queue.Empty:
                    break

            batched_state[i] = inp[0]
            callbacks.append(callback)
            nr_total += 1

        out = func(state=batched_state)
        for i in range(nr_total):
            policy = out['policy_explore'][i]
            if is_inference:
                # During inference the action should come from out['policy'][i],
                # but the two are equivalent under argmax, and only 'policy_explore'
                # is compiled as an output.
                action = policy.argmax()
            else:
                action = random.choice(len(policy), p=policy)
            callbacks[i](action, out['value'][i])
def sample_action(policy):
    space = get_env('a3c.actor_space')
    action = []
    for i, s in enumerate(space):
        a = random.choice(len(s), p=policy[i])
        action.append(a)
    return action
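# --- Hedged illustration (not part of the original file) ---
# sample_action() assumes a factored action space: `policy` holds one categorical
# distribution per action dimension, matching the entries of 'a3c.actor_space'.
# A numpy-only equivalent (the function name is hypothetical):
import numpy as np

def sample_action_np(policy, space, rng=np.random):
    # One independent categorical draw per action dimension.
    return [rng.choice(len(s), p=p) for p, s in zip(policy, space)]

# Example: two dimensions with 3 and 2 discrete choices each.
# sample_action_np([[0.2, 0.5, 0.3], [0.9, 0.1]], [(0, 1, 2), (0, 1)]) -> e.g. [1, 0]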
def demo(feed_dict, result, extra_info):
    mode = get_env('demo.mode', 'vae')
    assert mode in ('vae', 'draw')
    if mode == 'vae':
        demo_vae(feed_dict, result, extra_info)
    elif mode == 'draw':
        demo_draw(feed_dict, result, extra_info)
def make_optimizer(env):
    opt = rl.train.TRPOOptimizer(env, max_kl=get_env('trpo.max_kl'), cg_damping=get_env('trpo.cg.damping'))
    env.set_policy_optimizer(opt)

    use_linear_vr = get_env('trpo.use_linear_vr')
    if not use_linear_vr:
        wrapper = optimizer.OptimizerWrapper()
        wrapper.set_base_optimizer(
            optimizer.base.AdamOptimizer(get_env('trainer.value_learning_rate'), epsilon=1e-3))
        wrapper.append_grad_modifier(optimizer.grad_modifier.LearningRateMultiplier([
            ('*/b', 2.0),
        ]))
        env.set_value_optimizer(wrapper)
def json_summary_enable(trainer, js_path=json_path):
    if js_path is None:
        js_path = osp.join(get_env('dir.root'), 'summary.json')

    restored = 'restore_snapshot' in trainer.runtime
    if osp.exists(js_path) and not restored:
        logger.warn('Removing old summary json: {}.'.format(js_path))
        os.remove(js_path)

    trainer.runtime['json_summary_path'] = js_path
def main_demo(env, func):
    mode = get_env('demo.mode')
    assert mode is not None

    if mode == 'infogan':
        main_demo_infogan(env, func)
    else:
        assert False, 'Unknown mode {}'.format(mode)
def make_network(env):
    with env.create_network() as net:
        code_length = 20
        h, w, c = 28, 28, 1
        is_reconstruct = get_env('demo.is_reconstruct', False)

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                img = O.placeholder('img', shape=(None, h, w, c))
                return [img]

            def forward(x):
                if is_reconstruct or env.phase is env.Phase.TRAIN:
                    with env.variable_scope('encoder'):
                        _ = x
                        _ = O.fc('fc1', _, 500, nonlin=O.tanh)
                        _ = O.fc('fc2', _, 500, nonlin=O.tanh)
                        mu = O.fc('fc3_mu', _, code_length)
                        log_var = O.fc('fc3_sigma', _, code_length)
                        var = O.exp(log_var)
                        std = O.sqrt(var)
                        epsilon = O.random_normal([x.shape[0], code_length])
                        z_given_x = mu + std * epsilon
                else:
                    z_given_x = O.random_normal([1, code_length])

                with env.variable_scope('decoder'):
                    _ = z_given_x
                    _ = O.fc('fc1', _, 500, nonlin=O.tanh)
                    _ = O.fc('fc2', _, 500, nonlin=O.tanh)
                    _ = O.fc('fc3', _, 784, nonlin=O.sigmoid)
                    _ = _.reshape(-1, h, w, c)
                    x_given_z = _

                if env.phase is env.Phase.TRAIN:
                    with env.variable_scope('loss'):
                        content_loss = O.raw_cross_entropy_prob('raw_content', x_given_z.flatten2(), x.flatten2())
                        content_loss = content_loss.sum(axis=1).mean(name='content')
                        # distrib_loss = 0.5 * (O.sqr(mu) + O.sqr(std) - 2. * O.log(std + 1e-8) - 1.0).sum(axis=1)
                        distrib_loss = -0.5 * (1. + log_var - O.sqr(mu) - var).sum(axis=1)
                        distrib_loss = distrib_loss.mean(name='distrib')
                        loss = content_loss + distrib_loss
                    dpc.add_output(loss, name='loss', reduce_method='sum')

                dpc.add_output(x_given_z, name='output')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        net.add_all_dpc_outputs(dpc, loss_name='loss')

        if env.phase is env.Phase.TRAIN:
            summary.inference.scalar('loss', net.loss)
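# --- Hedged illustration (not part of the original file) ---
# The active `distrib_loss` above is the closed-form KL divergence of the approximate
# posterior from the standard-normal prior,
#   KL(N(mu, var) || N(0, 1)) = -0.5 * sum(1 + log_var - mu^2 - var),
# and the commented-out formula is the same quantity written in terms of std.
# A quick numpy check of that equivalence:
import numpy as np

mu = np.array([0.3, -1.2, 0.0])
log_var = np.array([-0.5, 0.1, 0.7])
var = np.exp(log_var)
std = np.sqrt(var)

kl_a = -0.5 * np.sum(1. + log_var - mu**2 - var)
kl_b = 0.5 * np.sum(mu**2 + std**2 - 2. * np.log(std) - 1.)
assert np.allclose(kl_a, kl_b)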
def ensure_load(cifar_num_classes):
    global _cifar

    if len(_cifar) == 0:
        for xy in load_cifar(get_env('dir.data'), cifar_num_classes):
            _cifar.append(dict(
                img=xy[0].astype('float32').reshape(-1, _cifar_img_dim, _cifar_img_dim, 3),
                label=xy[1]))
def make_dataflow_train(env):
    num_classes = get_env('dataset.nr_classes')
    ensure_load(num_classes)
    batch_size = get_env('trainer.batch_size')

    df = _cifar[0]
    df = flow.DOARandomSampleDataFlow(df)
    df = flow.BatchDataFlow(df, batch_size, sample_dict={
        'img': np.empty(shape=(batch_size, _cifar_img_dim, _cifar_img_dim, 3), dtype='float32'),
        'label': np.empty(shape=(batch_size, ), dtype='int32')
    })
    return df
def make_rpredictor_optimizer(env):
    wrapper = optimizer.OptimizerWrapper()
    wrapper.set_base_optimizer(
        optimizer.base.AdamOptimizer(get_env('rpredictor.learning_rate'), epsilon=1e-3))
    wrapper.append_grad_modifier(optimizer.grad_modifier.LearningRateMultiplier([
        ('*/b', 2.0),
    ]))
    env.set_optimizer(wrapper)