def make_network(env, h=None, w=None):
    """Build the VGG-style convolutional network used for neural style transfer.

    When ``h`` is None the image is a placeholder of shape
    ``(1, None, None, 3)``; otherwise it is a variable initialised with zeros
    of shape ``(1, h, w, 3)``. The image has the configured
    ``neural_style.image_mean`` subtracted, is padded to a multiple of 32 and
    then goes through five conv/pool stages.

    For every entry of the ``neural_style.content_layers`` and
    ``neural_style.style_layers`` settings, the object returned by
    ``net.find_var_by_name(entry[0] + '/bias')`` is registered as a network
    output named ``entry[0]``.
    """
    with env.create_network() as net:
        if h is None:
            img = O.placeholder('img', shape=(1, None, None, 3))
        else:
            img = O.variable('img', np.zeros([1, h, w, 3]))
        net.add_output(img, name='img')

        x = img - get_env('neural_style.image_mean').reshape(1, 1, 1, 3)
        x = O.pad_rb_multiple_of(x, 32)

        def stacked_conv(prefix, nr_convs, in_, channel, kernel=(3, 3), padding='SAME', nonlin=O.relu):
            """Chain ``nr_convs`` convolutions named ``<prefix>_1`` .. ``<prefix>_<nr_convs>``."""
            out = in_
            for idx in range(1, nr_convs + 1):
                out = O.conv2d('{}_{}'.format(prefix, idx), out, channel, kernel,
                               padding=padding, nonlin=nonlin)
            return out

        # (stage index, number of stacked convolutions, output channels)
        stages = ((1, 2, 64), (2, 2, 128), (3, 3, 256), (4, 3, 512), (5, 3, 512))
        for stage, nr_convs, channel in stages:
            x = stacked_conv('conv{}'.format(stage), nr_convs, x, channel)
            x = O.pooling2d('pool{}'.format(stage), x, (2, 2))

        # Content layers are registered first, then style layers.
        for key in ('neural_style.content_layers', 'neural_style.style_layers'):
            for layer in get_env(key):
                net.add_output(net.find_var_by_name(layer[0] + '/bias'), name=layer[0])
def make_param_gs(env, var_list, name_scope):
    """Create flat getter/setter graph nodes for a list of variables.

    Args:
        env: environment object providing ``name_scope``.
        var_list: variables to expose; each must have a fully known static shape.
        name_scope: name scope under which the getter and setter are built.

    Returns:
        A 4-tuple ``(nr_elems, getter, setter, provider)``: the total number
        of scalar elements over all variables, the result of
        ``vectorize_var_list(var_list)``, a ``tf.group`` of the per-variable
        assign ops, and the flat placeholder (converted with ``as_tftensor``)
        that feeds those assigns.
    """
    shapes = [as_tftensor(var).get_shape().as_list() for var in var_list]
    for shape, var in zip(shapes, var_list):
        assert None not in shape, 'Could not determine the shape for optimizable variable: {}.'.format(var)
    sizes = [as_tftensor(var).get_shape().num_elements() for var in var_list]
    nr_total_elems = sum(sizes)

    with env.name_scope(name_scope):
        # Getter: all variables vectorized into one tensor.
        param_getter = vectorize_var_list(var_list)

        # Setter: a single flat placeholder, sliced and reshaped per variable.
        flat_variables_tensor = O.placeholder('flat_variable_tensor', shape=(nr_total_elems, ))
        assign_ops = []
        begin = 0
        for var, shape, size in zip(var_list, shapes, sizes):
            end = begin + size
            chunk = flat_variables_tensor[begin:end].reshape(shape)
            # Plain tf.assign ops are created one by one and grouped afterwards.
            assign_ops.append(tf.assign(var, chunk, name='assign_{}'.format(escape_name(var))))
            begin = end
        param_setter = tf.group(*assign_ops)

    param_provider = as_tftensor(flat_variables_tensor)
    return nr_total_elems, param_getter, param_setter, param_provider
def testPaddingCenter(self):
    """``O.pad_center`` grows a 15x15 input to 17x17 with one zero on each side."""
    in_shape = (16, 15, 15, 3)
    source = O.placeholder('a', shape=in_shape)
    padded = O.pad_center(source, [17, 17])
    self.assertTupleEqual(padded.static_shape, (16, 17, 17, 3))

    value = np.random.normal(size=in_shape)
    # Reference result: one element of constant padding on both ends of axes 1 and 2.
    expected = np.pad(value, [[0, 0], [1, 1], [1, 1], [0, 0]], mode='constant')
    self.assertTensorClose(padded.eval(a=value), expected)
def testPaddingRBMultiple(self):
    """``O.pad_rb_multiple_of(a, 8)`` grows a 15x15 input to 16x16 by padding at the end."""
    in_shape = (16, 15, 15, 3)
    source = O.placeholder('a', shape=in_shape)
    padded = O.pad_rb_multiple_of(source, 8)
    self.assertTupleEqual(padded.static_shape, (16, 16, 16, 3))

    value = np.random.normal(size=in_shape)
    # Reference result: one trailing element of constant padding on axes 1 and 2.
    expected = np.pad(value, [[0, 0], [0, 1], [0, 1], [0, 0]], mode='constant')
    self.assertTensorClose(padded.eval(a=value), expected)
def testCropLU(self):
    """``O.crop_lu(a, [15, 15])`` keeps the leading 15x15 window of a 17x17 input."""
    in_shape = (16, 17, 17, 3)
    source = O.placeholder('a', shape=in_shape)
    cropped = O.crop_lu(source, [15, 15])
    self.assertTupleEqual(cropped.static_shape, (16, 15, 15, 3))

    value = np.random.normal(size=in_shape)
    # Reference result: drop the last two rows and columns.
    expected = value[:, :-2, :-2, :]
    self.assertTensorClose(cropped.eval(a=value), expected)
def make_network(env):
    """Build a small two-stage convnet classifier for 28x28x1 images.

    The convolutional trunk (conv -> batch norm -> relu -> pooling, twice) is
    built through the data-parallel controller; two fully connected layers
    produce 10 logits. Outputs are ``prob`` (softmax) and ``pred`` (argmax as
    int32). In the TRAIN phase a sparse softmax cross-entropy loss is set and
    accuracy/error/loss summaries are registered.
    """
    with env.create_network() as net:
        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                h, w, c = 28, 28, 1
                return [O.placeholder('img', shape=(None, h, w, c))]

            def forward(img):
                x = img
                # (stage index, number of conv channels)
                for stage, channels in ((1, 16), (2, 32)):
                    x = O.conv2d('conv{}'.format(stage), x, channels, (3, 3),
                                 padding='SAME', nonlin=O.identity)
                    x = O.batch_norm('bn{}'.format(stage), x)
                    x = O.relu(x)
                    x = O.pooling2d('pool{}'.format(stage), x, kernel=2)
                dpc.add_output(x, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        feature = dpc.outputs['feature']
        hidden = O.fc('fc1', feature, 64)
        logits = O.fc('fc2', hidden, 10)

        prob = O.softmax(logits, name='prob')
        pred = logits.argmax(axis=1).astype('int32', name='pred')
        net.add_output(prob)
        net.add_output(pred)

        if env.phase is env.Phase.TRAIN:
            label = O.placeholder('label', shape=(None, ), dtype='int32')
            xent = O.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
            loss = O.identity(xent.mean(), name='loss')
            net.set_loss(loss)

            accuracy = O.eq(label, pred).astype('float32').mean()
            error = 1. - accuracy

            summary.scalar('accuracy', accuracy)
            summary.scalar('error', error)
            summary.inference.scalar('loss', loss)
            summary.inference.scalar('accuracy', accuracy)
            summary.inference.scalar('error', error)
def make_network(env):
    """Build a convnet classifier for 32x32x3 images.

    The trunk has two stages of two 'SAME' conv+bn_relu layers followed by
    3x3/stride-2 pooling, then three 'VALID' conv+bn_relu layers. Three fully
    connected layers map the features to ``dataset.nr_classes`` logits.
    Outputs are ``prob`` and ``pred``; in the TRAIN phase the loss and the
    accuracy/error summaries are added.
    """
    with env.create_network() as net:
        nr_classes = get_env('dataset.nr_classes')

        # Convolution whose non-linearity is batch norm followed by relu.
        conv = functools.partial(O.conv2d, nonlin=O.bn_relu)

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                h, w, c = 32, 32, 3
                return [O.placeholder('img', shape=(None, h, w, c))]

            def forward(img):
                x = img
                for stage, channels in ((1, 16), (2, 32)):
                    for idx in (1, 2):
                        x = conv('conv{}.{}'.format(stage, idx), x, channels, (3, 3), padding='SAME')
                    x = O.pooling2d('pool{}'.format(stage), x, kernel=3, stride=2)
                for idx in (1, 2, 3):
                    x = conv('conv3.{}'.format(idx), x, 64, (3, 3), padding='VALID')
                dpc.add_output(x, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        feature = dpc.outputs['feature']
        hidden = O.fc('fc1', feature, 128, nonlin=O.relu)
        hidden = O.fc('fc2', hidden, 64, nonlin=O.relu)
        logits = O.fc('linear', hidden, nr_classes)

        prob = O.softmax(logits, name='prob')
        pred = logits.argmax(axis=1).astype('int32', name='pred')
        net.add_output(prob)
        net.add_output(pred)

        if env.phase is env.Phase.TRAIN:
            label = O.placeholder('label', shape=(None, ), dtype='int32')
            xent = O.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
            loss = O.identity(xent.mean(), name='loss')
            net.set_loss(loss)

            accuracy = O.eq(label, pred).astype('float32').mean()
            error = 1. - accuracy

            summary.scalar('accuracy', accuracy)
            summary.scalar('error', error)
            summary.inference.scalar('loss', loss)
            summary.inference.scalar('accuracy', accuracy)
            summary.inference.scalar('error', error)
def testAdvancedIndexing(self):
    """Check basic slicing and ``.ai`` indexing against numpy on a 5x5 tensor.

    ``set_sub`` and ``set_ai`` are expected to raise ``NotImplementedError``.
    """
    a = O.placeholder('a', shape=(5, 5))
    a_val = np.arange(25).reshape((5, 5)).astype('float32')
    feed = {a.name: a_val}

    def run(tensor):
        return tensor.eval(feed_dict=feed)

    # Basic slicing.
    self.assertTensorClose(run(a[0:3]), a_val[0:3])
    self.assertTensorClose(run(a[0:3, 0:3]), a_val[0:3, 0:3])
    with self.assertRaises(NotImplementedError):
        self.assertTensorClose(run(a.set_sub[0:3](1)), np.array([1, 1, 1, 3, 4]))

    # Advanced (integer list) indexing.
    self.assertTensorClose(run(a.ai[[0, 3]]), a_val[[0, 3]])
    self.assertTensorClose(run(a.ai[[0, 3], [0, 3]]), a_val[[0, 3], [0, 3]])
    with self.assertRaises(NotImplementedError):
        self.assertTensorClose(run(a.set_ai[[0, 3]](1)), np.array([1, 1, 1, 3, 4]))
def make_network(env):
    """Build the DQN network: shared conv features and a Q-value head.

    Both ``state`` and ``next_state`` go through the same (auto-reused)
    convolutional stack ``phi`` and fully connected head ``phi_fc``. Outputs
    are ``q_pred``, ``q_max`` and ``q_argmax`` for the current state. In the
    TRAIN phase a one-step TD target
    ``reward + gamma * (1 - is_over) * zero_grad(next_q_max)`` is regressed
    with a smooth-L1 loss.
    """
    is_train = env.phase is env.Phase.TRAIN

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                next_state = O.placeholder('next_state', shape=(None, h, w, c))
                return [state, next_state]

            @O.auto_reuse
            def phi(x):
                out = x / 255.0
                # Nature structure
                with O.argscope(O.conv2d, nonlin=O.relu):
                    out = O.conv2d('conv1', out, 32, 8, stride=4)
                    out = O.conv2d('conv2', out, 64, 4, stride=2)
                    out = O.conv2d('conv3', out, 64, 3, stride=1)
                return out

            def forward(state, next_state):
                dpc.add_output(phi(state), name='feature')
                dpc.add_output(phi(next_state), name='next_feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        @O.auto_reuse
        def phi_fc(feature):
            hidden = O.fc('fc0', feature, 512, nonlin=functools.partial(O.leaky_relu, alpha=0.01))
            q_pred = O.fc('fcq', hidden, get_player_nr_actions())
            q_max = q_pred.max(axis=1)
            q_argmax = q_pred.argmax(axis=1)
            return q_pred, q_max, q_argmax

        q_pred, q_max, q_argmax = phi_fc(dpc.outputs['feature'])
        # Only the maximum next-state Q-value is needed for the TD target.
        _, next_q_max, _ = phi_fc(dpc.outputs['next_feature'])

        net.add_output(q_pred, name='q_pred')
        net.add_output(q_max, name='q_max')
        net.add_output(q_argmax, name='q_argmax')

        if is_train:
            reward = O.placeholder('reward', shape=(None, ), dtype='float32')
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            is_over = O.placeholder('is_over', shape=(None, ), dtype='bool')

            # Only the single-step TD target is implemented here.
            assert get_env('dqn.nr_td_steps') == 1
            action_mask = O.one_hot(action, get_player_nr_actions())
            this_q_pred = (q_pred * action_mask).sum(axis=1)
            discount = get_env('dqn.gamma') * (1 - is_over.astype('float32'))
            this_q_label = reward + discount * O.zero_grad(next_q_max)

            summary.scalar('this_q_pred', this_q_pred.mean())
            summary.scalar('this_q_label', this_q_label.mean())
            summary.scalar('reward', reward.mean())
            summary.scalar('is_over', is_over.astype('float32').mean())

            raw_loss = O.raw_smooth_l1_loss('raw_q_loss', this_q_pred, this_q_label)
            q_loss = raw_loss.mean(name='q_loss')
            net.set_loss(q_loss)
def inputs():
    """Create the ``img`` and ``zc`` placeholders and return them as a list.

    ``h``, ``w``, ``c`` and ``net`` are resolved from the enclosing scope.
    """
    image = O.placeholder('img', shape=(None, h, w, c))
    # The latent code placeholder is only used at demo time.
    latent_code = O.placeholder('zc', shape=(1, net.zc_distrib.sample_size))
    return [image, latent_code]
def make_network(env):
    """Build the PPO actor-critic network with a Gaussian policy.

    The ``policy`` scope maps the state to ``theta = concat(mu, logstd)`` and a
    sampled action clipped to [-1, 1]; the ``value`` scope maps the state to a
    scalar value. In the TRAIN phase the clipped surrogate policy loss, the
    entropy loss and the clipped value loss are exposed as outputs and their
    sum is set as the network loss.
    """
    # NOTE(review): the nesting of the TRAIN sections inside the variable
    # scopes was reconstructed from a whitespace-collapsed source -- confirm.
    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy', size=get_action_shape()[0], fixed_std=False)

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        batch_size = state.shape[0]

        # The variable scopes are declared explicitly so that the optimizer
        # can later address policy and value parameters separately.
        with env.variable_scope('policy'):
            hidden = O.fc('fc1', state, 64, nonlin=O.relu)
            hidden = O.fc('fc2', hidden, 64, nonlin=O.relu)
            mu = O.fc('fc_mu', hidden, net.dist.sample_size, nonlin=O.tanh)
            logstd = O.variable('logstd', O.truncated_normal_initializer(stddev=0.01),
                                shape=(net.dist.sample_size, ), trainable=True)
            # The state-independent log-std is repeated for each batch row.
            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size, theta=theta, process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

            if env.phase == env.Phase.TRAIN:
                theta_old = O.placeholder('theta_old', shape=(None, net.dist.param_size))
                action = O.placeholder('action', shape=(None, net.dist.sample_size))
                advantage = O.placeholder('advantage', shape=(None, ))
                # NOTE(review): `g` is not defined in this function --
                # presumably a module-level config object; confirm.
                entropy_beta = O.scalar('entropy_beta', g.entropy_beta)

                log_prob = net.dist.log_likelihood(action, theta, process_theta=True)
                log_prob_old = net.dist.log_likelihood(action, theta_old, process_theta=True)

                ratio = O.exp(log_prob - log_prob_old)
                epsilon = get_env('ppo.epsilon')
                # Surrogate from conservative policy iteration.
                surr1 = ratio * advantage
                clipped_ratio = O.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
                surr2 = clipped_ratio * advantage
                # PPO's pessimistic surrogate (L^CLIP).
                policy_loss = -O.reduce_mean(O.min(surr1, surr2))

                entropy = net.dist.entropy(theta, process_theta=True).mean()
                entropy_loss = -entropy_beta * entropy

                net.add_output(policy_loss, name='policy_loss')
                net.add_output(entropy_loss, name='entropy_loss')

                summary.scalar('policy_entropy', entropy)

        with env.variable_scope('value'):
            hidden = O.fc('fc1', state, 64, nonlin=O.relu)
            hidden = O.fc('fc2', hidden, 64, nonlin=O.relu)
            value = O.fc('fcv', hidden, 1)
            value = value.remove_axis(1)
            net.add_output(value, name='value')

            if env.phase == env.Phase.TRAIN:
                value_label = O.placeholder('value_label', shape=(None, ))
                value_old = O.placeholder('value_old', shape=(None, ))

                value_surr1 = O.raw_l2_loss('raw_value_loss_surr1', value, value_label)
                # The value update is clipped to within epsilon of the old prediction.
                value_delta = O.clip_by_value(value - value_old, -epsilon, epsilon)
                value_clipped = value_old + value_delta
                value_surr2 = O.raw_l2_loss('raw_value_loss_surr2', value_clipped, value_label)
                value_loss = O.reduce_mean(O.max(value_surr1, value_surr2))
                net.add_output(value_loss, name='value_loss')

        if env.phase == env.Phase.TRAIN:
            loss = O.identity(policy_loss + entropy_loss + value_loss, name='total_loss')
            net.set_loss(loss)
def make_network(env):
    """Build the actor-critic (A3C) network for a continuous action space.

    Policy and value features are computed by two separate fully connected
    stacks. The policy head predicts ``policy_mu``; the exploration policy is a
    softmax over ``-|bin - mu|`` on the discretized action space taken from the
    ``a3c.actor_space`` setting (a Laplacian-shaped distribution with scale
    fixed to 1). Outputs are ``policy_explore`` (discrete), ``policy`` (the
    mean clipped to the action range) and ``value``.

    In the TRAIN phase the network is built on the master device only (slave
    devices are cleared and restored afterwards), and the loss is the sum of
    the policy loss and a smooth-L1 value loss.
    """
    is_train = env.phase is env.Phase.TRAIN

    # Device control: always use the master device only for a training session.
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        input_length, = get_input_shape()
        action_length, = get_action_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, input_length))
                return [state]

            # Forward the policy network and value network separately (actor-critic).
            def forward(x):
                _ = x
                _ = O.fc('fcp1', _, 512, nonlin=O.relu)
                _ = O.fc('fcp2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_p')

                _ = x
                _ = O.fc('fcv1', _, 512, nonlin=O.relu)
                _ = O.fc('fcv2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_v')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature_p']
        # Only the mean is predicted. A learnable (spherical) variance head is
        # intentionally not used in this example: the scale is fixed to 1.
        policy_mu = O.fc('fc_policy_mu', _, action_length)

        actor_space = get_env('a3c.actor_space')
        nr_bins = actor_space.shape[1]

        # Instead of a normal distribution, a Laplacian distribution is used for
        # the policy, truncated to the action space. To simplify the
        # computation, the action space is discretized into `nr_bins` bins.
        actor_space = O.constant(actor_space)
        actor_space = O.tile(actor_space.add_axis(0), [policy_mu.shape[0], 1, 1])
        policy_mu3 = O.tile(policy_mu.add_axis(2), [1, 1, nr_bins])

        # With the scale forced to 1 the logits are just the negative distance
        # between each bin and the predicted mean.
        logits_explore = -O.abs(actor_space - policy_mu3)
        policy_explore = O.softmax(logits_explore)

        # Clip the policy for output.
        action_range = get_action_range()
        action_range = tuple(map(O.constant, action_range))
        action_range = tuple(map(lambda x: O.tile(x.add_axis(0), [policy_mu.shape[0], 1]), action_range))
        policy_output = O.clip_by_value(policy_mu, *action_range)

        _ = dpc.outputs['feature_v']
        value = O.fc('fc_value', _, 1)
        value = value.remove_axis(1, name='value')

        # Note that policy_explore is a discrete policy, while policy is the
        # continuous one.
        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy_output, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, action_length), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))
            # Kept although currently unused by the loss (there is no
            # trainable variance, hence no entropy term): removing it would
            # change the set of variables in the graph.
            entropy_beta = O.scalar('entropy_beta', 0.1, trainable=False)

            # Since the action space is discretized, use cross entropy here.
            log_policy = O.log(policy_explore + 1e-4)
            log_pi_a_given_s = (log_policy * O.one_hot(action, nr_bins)).sum(axis=2).sum(axis=1)
            advantage = (future_reward - O.zero_grad(value)).rename('advantage')

            # Important trick: use only positive advantages to perform gradient
            # ascent. This stabilizes the training.
            advantage = advantage * O.zero_grad((advantage > 0.).astype('float32'))
            policy_loss = O.identity(-(log_pi_a_given_s * advantage).mean(), name='policy_loss')

            value_loss = O.raw_smooth_l1_loss('raw_value_loss', future_reward, value).mean(name='value_loss')

            # Fix: the policy term is bound to `policy_loss`; the previous code
            # referenced an undefined name `policy_cost` here and in the
            # summary loop, raising NameError whenever the TRAIN phase was built.
            loss = O.add_n([policy_loss, value_loss], name='loss')
            net.set_loss(loss)

            for v in [policy_loss, value_loss,
                      value.mean(name='predict_value'), advantage.rms(name='rms_advantage'), loss]:
                summary.scalar(v)

    if is_train:
        env.set_slave_devices(slave_devices)
def inputs():
    """Return a one-element list holding the ``state`` placeholder.

    ``input_length`` is resolved from the enclosing scope.
    """
    return [O.placeholder('state', shape=(None, input_length))]
def make_network(env):
    """Build a pre-activation ResNet classifier for 32x32x3 images.

    The trunk has an initial conv+bn_relu layer followed by three stages of
    ``n`` residual blocks each. Stages 2 and 3 double the channel count and
    use stride 2 in their first block. The features are batch-normalised,
    rectified and averaged over axes 1 and 2, then a linear layer maps them to
    ``dataset.nr_classes`` logits. Outputs are ``prob`` and ``pred``; in the
    TRAIN phase the loss and accuracy/error summaries are added.
    """
    # NOTE(review): the extent of the `variable_scope` block inside `residual`
    # was reconstructed from a whitespace-collapsed source -- confirm.
    with env.create_network() as net:
        n = 2  # residual blocks per stage
        nr_classes = get_env('dataset.nr_classes')

        conv2d = functools.partial(O.conv2d, kernel=3, use_bias=False, padding='SAME')
        conv_bn_relu = functools.partial(conv2d, nonlin=O.bn_relu)

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                h, w, c = 32, 32, 3
                return [O.placeholder('img', shape=(None, h, w, c))]

            def residual(name, x, first=False, inc_dim=False):
                """One residual block; ``inc_dim`` doubles channels and halves resolution."""
                in_channel = x.static_shape[3]
                if inc_dim:
                    out_channel, stride = in_channel * 2, 2
                else:
                    out_channel, stride = in_channel, 1

                with env.variable_scope(name):
                    branch = x if first else O.bn_relu(x)
                    branch = conv_bn_relu('conv1', branch, out_channel, stride=stride)
                    branch = conv2d('conv2', branch, out_channel)
                    if inc_dim:
                        # The shortcut is pooled and padded on the channel
                        # axis so it matches the branch.
                        half = in_channel // 2
                        x = O.pooling2d('pool', x, kernel=2)
                        x = O.pad(x, [[0, 0], [0, 0], [0, 0], [half, half]])
                    print(name, x.static_shape)
                    return branch + x

            def forward(img):
                x = img / 128.0 - 1.0
                x = conv_bn_relu('conv0', x, 16)

                for stage in (1, 2, 3):
                    if stage == 1:
                        x = residual('res1.0', x, first=True)
                    else:
                        x = residual('res{}.0'.format(stage), x, inc_dim=True)
                    for i in range(1, n):
                        x = residual('res{}.{}'.format(stage, i), x)

                x = O.batch_norm('bn_last', x)
                x = O.relu(x)
                x = x.mean(axis=[1, 2])  # global avg pool
                dpc.add_output(x, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        feature = dpc.outputs['feature']
        logits = O.fc('linear', feature, nr_classes)

        prob = O.softmax(logits, name='prob')
        pred = logits.argmax(axis=1).astype('int32', name='pred')
        net.add_output(prob)
        net.add_output(pred)

        if env.phase is env.Phase.TRAIN:
            label = O.placeholder('label', shape=(None, ), dtype='int32')
            xent = O.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
            loss = O.identity(xent.mean(), name='loss')
            net.set_loss(loss)

            accuracy = O.eq(label, pred).astype('float32').mean()
            error = 1. - accuracy

            summary.scalar('accuracy', accuracy)
            summary.scalar('error', error)
            summary.inference.scalar('loss', loss)
            summary.inference.scalar('accuracy', accuracy)
            summary.inference.scalar('error', error)
def inputs():
    """Create the ``img`` placeholder; return it only in the TRAIN phase.

    The placeholder is always created. Outside the TRAIN phase an empty list is
    returned. ``h``, ``w``, ``c`` and ``env`` are resolved from the enclosing
    scope.
    """
    img = O.placeholder('img', shape=(None, h, w, c))
    return [img] if env.phase is env.Phase.TRAIN else []
def make_network(env):
    """Build the A3C network for a discrete action space.

    A four-layer convolutional trunk feeds a 512-unit fully connected layer
    with policy and value heads. Outputs are ``policy_explore`` (softmax of the
    logits scaled by the non-trainable ``explore_factor``), ``policy`` and
    ``value``. In the TRAIN phase the network is built on the master device
    only, and the loss combines the policy-gradient term, an entropy bonus
    weighted by ``entropy_beta`` and an L2 value loss.
    """
    is_train = env.phase is env.Phase.TRAIN

    # Training uses the master device only; the slave devices are restored at the end.
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                return [O.placeholder('state', shape=(None, h, w, c))]

            def forward(x):
                out = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    out = O.conv2d('conv0', out, 32, 5)
                    out = O.max_pooling2d('pool0', out, 2)
                    out = O.conv2d('conv1', out, 32, 5)
                    out = O.max_pooling2d('pool1', out, 2)
                    out = O.conv2d('conv2', out, 64, 4)
                    out = O.max_pooling2d('pool2', out, 2)
                    out = O.conv2d('conv3', out, 64, 3)
                dpc.add_output(out, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        feature = dpc.outputs['feature']
        hidden = O.fc('fc0', feature, 512, nonlin=O.p_relu)
        policy_logits = O.fc('fc_policy', hidden, get_player_nr_actions())
        value = O.fc('fc_value', hidden, 1)

        expf = O.scalar('explore_factor', 1, trainable=False)
        policy_explore = O.softmax(policy_logits * expf, name='policy_explore')
        policy = O.softmax(policy_logits, name='policy')
        value = value.remove_axis(1, name='value')

        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))

            log_policy = O.log(policy + 1e-6)
            action_mask = O.one_hot(action, get_player_nr_actions())
            log_pi_a_given_s = (log_policy * action_mask).sum(axis=1)
            # The value prediction is treated as a constant in the advantage.
            advantage = (future_reward - O.zero_grad(value)).rename('advantage')
            policy_cost = (log_pi_a_given_s * advantage).mean(name='policy_cost')
            xentropy_cost = (-policy * log_policy).sum(axis=1).mean(name='xentropy_cost')
            value_loss = O.raw_l2_loss('raw_value_loss', future_reward, value).mean(name='value_loss')
            entropy_beta = O.scalar('entropy_beta', 0.01, trainable=False)

            # Both policy_cost and the entropy term are maximised, hence the negation.
            loss_terms = [-policy_cost, -xentropy_cost * entropy_beta, value_loss]
            loss = O.add_n(loss_terms, name='loss')
            net.set_loss(loss)

            monitored = [
                policy_cost,
                xentropy_cost,
                value_loss,
                value.mean(name='predict_value'),
                advantage.rms(name='rms_advantage'),
                loss,
            ]
            for tensor in monitored:
                summary.scalar(tensor)

    if is_train:
        env.set_slave_devices(slave_devices)
def inputs():
    """Return a one-element list holding the ``img`` placeholder.

    ``h``, ``w`` and ``c`` are resolved from the enclosing scope.
    """
    return [O.placeholder('img', shape=(None, h, w, c))]
def make_network(env):
    """Build a single fully connected layer from ``state`` and expose it as ``policy``."""
    with env.create_network() as net:
        state_shape = (None, ) + get_input_shape()
        state = O.placeholder('state', shape=state_shape)
        net.add_output(O.fc('fc', state, get_action_shape()), name='policy')
def make_rpredictor_network(env):
    """Build the reward-predictor network trained from pairwise preferences.

    Three image inputs (``state``, ``t1_state``, ``t2_state``) share one
    auto-reused convolutional trunk. A shared fully connected head maps
    (feature, one-hot action) to a scalar reward, exposed as output ``reward``
    for ``state``.

    In the TRAIN phase the summed predicted rewards of the two inputs are
    exponentiated, ``p_greater = e1 / (e1 + e2)``, and the loss is
    ``-(1 - pref) * log(p_greater) - pref * log(1 - p_greater)``.
    """
    is_train = env.phase is env.Phase.TRAIN

    with env.create_network() as net:
        h, w, c = get_input_shape()
        # Hack(MJY):: forced RGB input (instead of combination of history frames)
        c = 3

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                t1_state = O.placeholder('t1_state', shape=(None, h, w, c))
                t2_state = O.placeholder('t2_state', shape=(None, h, w, c))
                return [state, t1_state, t2_state]

            @O.auto_reuse
            def forward_conv(x):
                out = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    out = O.conv2d('conv0', out, 32, 5)
                    out = O.max_pooling2d('pool0', out, 2)
                    out = O.conv2d('conv1', out, 32, 5)
                    out = O.max_pooling2d('pool1', out, 2)
                    out = O.conv2d('conv2', out, 64, 4)
                    out = O.max_pooling2d('pool2', out, 2)
                    out = O.conv2d('conv3', out, 64, 3)
                return out

            def forward(x, t1, t2):
                dpc.add_output(forward_conv(x), name='feature')
                dpc.add_output(forward_conv(t1), name='t1_feature')
                dpc.add_output(forward_conv(t2), name='t2_feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        @O.auto_reuse
        def forward_fc(feature, action):
            action = O.one_hot(action, get_player_nr_actions())
            joint = O.concat([feature.flatten2(), action], axis=1)
            hidden = O.fc('fc0', joint, 512, nonlin=O.p_relu)
            return O.fc('fc_reward', hidden, 1)

        action = O.placeholder('action', shape=(None, ), dtype='int64')
        net.add_output(forward_fc(dpc.outputs['feature'], action), name='reward')

        if is_train:
            t1_action = O.placeholder('t1_action', shape=(None, ), dtype='int64')
            t1_reward_sum = forward_fc(dpc.outputs['t1_feature'], t1_action).sum()
            t1_reward_exp = O.exp(t1_reward_sum)

            t2_action = O.placeholder('t2_action', shape=(None, ), dtype='int64')
            t2_reward_sum = forward_fc(dpc.outputs['t2_feature'], t2_action).sum()
            t2_reward_exp = O.exp(t2_reward_sum)

            pref = O.placeholder('pref')
            pref = O.callback_injector(pref)
            weight_first, weight_second = 1 - pref, pref

            # Softmax over the two summed rewards.
            p_greater = t1_reward_exp / (t1_reward_exp + t2_reward_exp)
            loss = -weight_first * O.log(p_greater) - weight_second * O.log(1 - p_greater)
            net.set_loss(loss)
def make_network(env):
    """Build the TRPO network with a Gaussian policy and an optional value head.

    The ``policy`` scope maps the state to ``theta = concat(mu, logstd)`` and a
    sampled action clipped to [-1, 1]. In the TRAIN phase the
    importance-sampled surrogate loss and two KL terms (``kl``, ``kl_self``)
    are exposed as outputs. When ``trpo.use_linear_vr`` is set, a
    ``LinearValueRegressor`` is attached to the network instead of building a
    ``value`` scope.
    """
    # NOTE(review): the nesting of the TRAIN sections inside the variable
    # scopes was reconstructed from a whitespace-collapsed source -- confirm.
    use_linear_vr = get_env('trpo.use_linear_vr')

    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy', size=get_action_shape()[0], fixed_std=False)
        if use_linear_vr:
            from tartist.app.rl.utils.math import LinearValueRegressor
            net.value_regressor = LinearValueRegressor()

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        # Input normalisation (moving average, clipping to [-10, 10]) is
        # currently disabled.
        batch_size = state.shape[0]

        # The variable scopes are declared explicitly so that the optimizer
        # can later address the policy parameters separately.
        with env.variable_scope('policy'):
            with O.argscope(O.fc):
                hidden = O.fc('fc1', state, 64, nonlin=O.relu)
                hidden = O.fc('fc2', hidden, 64, nonlin=O.relu)
                mu = O.fc('fc_mu', hidden, net.dist.sample_size, nonlin=O.tanh)
                logstd = O.variable('logstd', O.truncated_normal_initializer(stddev=0.01),
                                    shape=(net.dist.sample_size, ), trainable=True)

            # The state-independent log-std is repeated for each batch row.
            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size, theta=theta, process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

            if env.phase == env.Phase.TRAIN:
                theta_old = O.placeholder('theta_old', shape=(None, net.dist.param_size))
                action = O.placeholder('action', shape=(None, net.dist.sample_size))
                advantage = O.placeholder('advantage', shape=(None, ))

                log_prob = net.dist.log_likelihood(action, theta, process_theta=True)
                log_prob_old = net.dist.log_likelihood(action, theta_old, process_theta=True)

                # Importance sampling of surrogate loss (L in paper).
                ratio = O.exp(log_prob - log_prob_old)
                policy_loss = -O.reduce_mean(ratio * advantage)

                kl = net.dist.kl(theta_p=theta_old, theta_q=theta, process_theta=True).mean()
                # KL between the current policy and a gradient-stopped copy of itself.
                kl_self = net.dist.kl(theta_p=O.zero_grad(theta), theta_q=theta, process_theta=True).mean()
                entropy = net.dist.entropy(theta, process_theta=True).mean()

                net.add_output(policy_loss, name='policy_loss')
                net.add_output(kl, name='kl')
                net.add_output(kl_self, name='kl_self')

                summary.scalar('policy_entropy', entropy,
                               collections=[rl.train.ACGraphKeys.POLICY_SUMMARIES])

        if not use_linear_vr:
            with env.variable_scope('value'):
                value = O.fc('fcv', state, 1)
                net.add_output(value, name='value')

                if env.phase == env.Phase.TRAIN:
                    value_label = O.placeholder('value_label', shape=(None, ))
                    raw_value_loss = O.raw_l2_loss('raw_value_loss', value, value_label)
                    value_loss = raw_value_loss.mean(name='value_loss')
                    net.add_output(value_loss, name='value_loss')
def inputs():
    """Return a one-element list holding the ``state`` placeholder.

    ``h``, ``w`` and ``c`` are resolved from the enclosing scope.
    """
    return [O.placeholder('state', shape=(None, h, w, c))]
def inputs():
    """Create the ``img_a`` and ``img_b`` placeholders and return them in that order.

    ``h``, ``w`` and ``c`` are resolved from the enclosing scope.
    """
    image_shape = (None, h, w, c)
    first = O.placeholder('img_a', shape=image_shape)
    second = O.placeholder('img_b', shape=image_shape)
    return [first, second]