# `O` is the framework's operator namespace (in TensorArtist, e.g. `from tartist.nn import opr as O`);
# `env`, `summary`, `get_env`, etc. in the snippets below come from the same framework.


def encoder(x, nonlin):
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init), \
         O.argscope(O.leaky_relu, alpha=0.2):
        _ = x
        _ = O.conv2d('conv1', _, 64, nonlin=O.leaky_relu)
        _ = O.conv2d('conv2', _, 128, nonlin=nonlin, use_bias=False)
        _ = O.conv2d('conv3', _, 256, nonlin=nonlin, use_bias=False)
        _ = O.conv2d('conv4', _, 512, nonlin=nonlin, use_bias=False)
        z = _
    return z

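# Note on `use_bias=False` for conv2-conv4: the `nonlin` passed to `encoder` is presumably a
# batch-norm-plus-activation op (cf. `O.bn_relu` / `O.bn_nonlin` in the other snippets), and batch
# normalization subtracts the per-channel mean, so a convolution bias right before it is redundant.
# A minimal NumPy sketch of that cancellation (the `bn` helper and shapes are illustrative only,
# not part of the framework):

import numpy as np

def bn(x, eps=1e-5):
    # Per-channel batch normalization over (batch, height, width).
    mean = x.mean(axis=(0, 1, 2), keepdims=True)
    var = x.var(axis=(0, 1, 2), keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

x = np.random.randn(8, 14, 14, 64)   # pretend conv output, NHWC
b = np.random.randn(64)              # per-channel bias

# Adding a bias before batch norm has no effect on the normalized output.
print(np.allclose(bn(x), bn(x + b)))  # True
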
def generator(z):
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init), \
         O.argscope(O.fc, W=w_init):
        _ = z
        _ = O.fc('fc1', _, 1024, nonlin=O.bn_relu)
        _ = O.fc('fc2', _, 128 * 7 * 7, nonlin=O.bn_relu)
        _ = O.reshape(_, [-1, 7, 7, 128])
        _ = O.deconv2d('deconv1', _, 64, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv2', _, 1)
        _ = O.sigmoid(_, 'out')
    return _

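# The generator's shapes are pinned to 28x28x1 outputs (MNIST-sized images): `fc2` is reshaped to
# a 7x7x128 tensor and each stride-2 `deconv2d` doubles the spatial size, giving 14x14 and then
# 28x28 with a single output channel. A quick sanity check of that arithmetic (plain Python,
# independent of the framework):

size = 7                        # spatial size after the reshape to (7, 7, 128)
for layer in ('deconv1', 'deconv2'):
    size *= 2                   # each stride-2 deconvolution doubles height/width
    print(layer, size)          # deconv1 14, deconv2 28
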
def decoder(z):
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init), \
         O.argscope(O.fc, W=w_init):
        _ = z
        _ = O.deconv2d('deconv1', _, 256, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv2', _, 128, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv3', _, 64, nonlin=O.bn_relu)
        _ = O.deconv2d('deconv4', _, c)  # `c`: number of output channels, defined in the enclosing scope
        _ = O.sigmoid(_, name='out')
        x = _
    return x

def discriminator(img):
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init), \
         O.argscope(O.fc, W=w_init), \
         O.argscope(O.leaky_relu, alpha=0.2):
        _ = img
        _ = O.conv2d('conv1', _, 64, nonlin=O.leaky_relu)
        _ = O.conv2d('conv2', _, 128, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)
        _ = O.fc('fc1', _, 1024, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)
        _ = O.fc('fct', _, 1)
    return _

def discriminator(img):
    w_init = O.truncated_normal_initializer(stddev=0.02)
    with O.argscope(O.conv2d, O.deconv2d, kernel=4, stride=2, W=w_init), \
         O.argscope(O.fc, W=w_init), \
         O.argscope(O.leaky_relu, alpha=0.2):
        _ = img
        _ = O.conv2d('conv1', _, 64, nonlin=O.leaky_relu)
        _ = O.conv2d('conv2', _, 128, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)
        _ = O.fc('fc1', _, 1024, nonlin=O.bn_nonlin)
        _ = O.leaky_relu(_)

        # Two heads: a real/fake score and a prediction of the latent-code distribution parameters.
        with env.variable_scope('score'):
            logits = O.fc('fct', _, 1)

        with env.variable_scope('code'):
            _ = O.fc('fc1', _, 128, nonlin=O.bn_nonlin)
            _ = O.leaky_relu(_)
            code = O.fc('fc2', _, zc_distrib.param_size)  # `zc_distrib`: latent-code distribution, defined in the enclosing scope

    return logits, code

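# This second discriminator variant returns the real/fake logit (before any sigmoid) plus a `code`
# head sized to `zc_distrib.param_size`, which suggests an InfoGAN-style setup. The training code
# is not shown here; as an illustration only, the standard GAN objectives on such logits can be
# computed with a numerically stable sigmoid cross-entropy. A minimal NumPy sketch (all values and
# helper names below are made up for illustration):

import numpy as np

def sigmoid_xent(logits, labels):
    # Numerically stable sigmoid cross-entropy, elementwise.
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))

real_logits = np.random.randn(16)   # discriminator scores on real images
fake_logits = np.random.randn(16)   # discriminator scores on generated images

d_loss = sigmoid_xent(real_logits, 1.0).mean() + sigmoid_xent(fake_logits, 0.0).mean()
g_loss = sigmoid_xent(fake_logits, 1.0).mean()  # non-saturating generator loss
print(d_loss, g_loss)
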
def make_network(env):
    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy', size=get_action_shape()[0], fixed_std=False)

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        batch_size = state.shape[0]

        # We have to define variable scope here for later optimization.
        with env.variable_scope('policy'):
            _ = state
            _ = O.fc('fc1', _, 64, nonlin=O.relu)
            _ = O.fc('fc2', _, 64, nonlin=O.relu)
            mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
            logstd = O.variable('logstd', O.truncated_normal_initializer(stddev=0.01),
                                shape=(net.dist.sample_size, ), trainable=True)
            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])

            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size, theta=theta, process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

            if env.phase == env.Phase.TRAIN:
                theta_old = O.placeholder('theta_old', shape=(None, net.dist.param_size))
                action = O.placeholder('action', shape=(None, net.dist.sample_size))
                advantage = O.placeholder('advantage', shape=(None, ))
                entropy_beta = O.scalar('entropy_beta', g.entropy_beta)  # `g`: global config object defined elsewhere

                log_prob = net.dist.log_likelihood(action, theta, process_theta=True)
                log_prob_old = net.dist.log_likelihood(action, theta_old, process_theta=True)

                ratio = O.exp(log_prob - log_prob_old)
                epsilon = get_env('ppo.epsilon')
                surr1 = ratio * advantage  # surrogate from conservative policy iteration
                surr2 = O.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
                policy_loss = -O.reduce_mean(O.min(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
                entropy = net.dist.entropy(theta, process_theta=True).mean()
                entropy_loss = -entropy_beta * entropy

                net.add_output(policy_loss, name='policy_loss')
                net.add_output(entropy_loss, name='entropy_loss')

                summary.scalar('policy_entropy', entropy)

        with env.variable_scope('value'):
            _ = state
            _ = O.fc('fc1', _, 64, nonlin=O.relu)
            _ = O.fc('fc2', _, 64, nonlin=O.relu)
            value = O.fc('fcv', _, 1)
            value = value.remove_axis(1)
            net.add_output(value, name='value')

            if env.phase == env.Phase.TRAIN:
                value_label = O.placeholder('value_label', shape=(None, ))
                value_old = O.placeholder('value_old', shape=(None, ))

                # Clipped value loss, using the same `epsilon` defined in the policy branch above.
                value_surr1 = O.raw_l2_loss('raw_value_loss_surr1', value, value_label)
                value_clipped = value_old + O.clip_by_value(value - value_old, -epsilon, epsilon)
                value_surr2 = O.raw_l2_loss('raw_value_loss_surr2', value_clipped, value_label)
                value_loss = O.reduce_mean(O.max(value_surr1, value_surr2))
                net.add_output(value_loss, name='value_loss')

        if env.phase == env.Phase.TRAIN:
            loss = O.identity(policy_loss + entropy_loss + value_loss, name='total_loss')
            net.set_loss(loss)

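# The core of the PPO objective above is the clipped surrogate: the probability ratio
# exp(log_prob - log_prob_old) is clipped to [1 - epsilon, 1 + epsilon], and the pessimistic
# (elementwise minimum) of the clipped and unclipped terms is maximized, which removes the
# incentive to push the policy far from the one that collected the data. The value loss is
# clipped the same way around the old value prediction. A small NumPy sketch of the policy term
# (the numbers below are made up for illustration):

import numpy as np

def ppo_policy_loss(log_prob, log_prob_old, advantage, epsilon=0.2):
    ratio = np.exp(log_prob - log_prob_old)
    surr1 = ratio * advantage
    surr2 = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return -np.minimum(surr1, surr2).mean()  # pessimistic surrogate, L^CLIP

log_prob     = np.array([-1.0, -2.0, -0.5])
log_prob_old = np.array([-1.2, -1.5, -0.6])
advantage    = np.array([ 1.0, -0.5,  2.0])
print(ppo_policy_loss(log_prob, log_prob_old, advantage))
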
def make_network(env):
    use_linear_vr = get_env('trpo.use_linear_vr')

    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution('policy', size=get_action_shape()[0], fixed_std=False)
        if use_linear_vr:
            from tartist.app.rl.utils.math import LinearValueRegressor
            net.value_regressor = LinearValueRegressor()

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        # state = O.moving_average(state)
        # state = O.clip_by_value(state, -10, 10)
        batch_size = state.shape[0]

        # We have to define variable scope here for later optimization.
        with env.variable_scope('policy'):
            _ = state

            with O.argscope(O.fc):
                _ = O.fc('fc1', _, 64, nonlin=O.relu)
                _ = O.fc('fc2', _, 64, nonlin=O.relu)
                mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
                logstd = O.variable('logstd', O.truncated_normal_initializer(stddev=0.01),
                                    shape=(net.dist.sample_size, ), trainable=True)

            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size, theta=theta, process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

            if env.phase == env.Phase.TRAIN:
                theta_old = O.placeholder('theta_old', shape=(None, net.dist.param_size))
                action = O.placeholder('action', shape=(None, net.dist.sample_size))
                advantage = O.placeholder('advantage', shape=(None, ))

                log_prob = net.dist.log_likelihood(action, theta, process_theta=True)
                log_prob_old = net.dist.log_likelihood(action, theta_old, process_theta=True)

                # Importance sampling of surrogate loss (L in paper).
                ratio = O.exp(log_prob - log_prob_old)
                policy_loss = -O.reduce_mean(ratio * advantage)

                kl = net.dist.kl(theta_p=theta_old, theta_q=theta, process_theta=True).mean()
                kl_self = net.dist.kl(theta_p=O.zero_grad(theta), theta_q=theta, process_theta=True).mean()
                entropy = net.dist.entropy(theta, process_theta=True).mean()

                net.add_output(policy_loss, name='policy_loss')
                net.add_output(kl, name='kl')
                net.add_output(kl_self, name='kl_self')

                summary.scalar('policy_entropy', entropy, collections=[rl.train.ACGraphKeys.POLICY_SUMMARIES])

        if not use_linear_vr:
            with env.variable_scope('value'):
                value = O.fc('fcv', state, 1)
                net.add_output(value, name='value')

                if env.phase == env.Phase.TRAIN:
                    value_label = O.placeholder('value_label', shape=(None, ))
                    value_loss = O.raw_l2_loss('raw_value_loss', value, value_label).mean(name='value_loss')
                    net.add_output(value_loss, name='value_loss')

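# For the GaussianDistribution used above, `theta` concatenates the mean and the log standard
# deviation along the last axis. The TRPO surrogate is the importance-weighted advantage, and the
# KL divergence between the old and current policies (the trust-region constraint) decomposes per
# dimension for diagonal Gaussians. A NumPy sketch of those two quantities under that
# parameterization (the split of `theta` into `mu` and `logstd` mirrors the `O.concat` above;
# everything else here is illustrative):

import numpy as np

def split_theta(theta):
    # theta = concat([mu, logstd], axis=-1), as built in make_network above.
    d = theta.shape[-1] // 2
    return theta[..., :d], theta[..., d:]

def gaussian_log_likelihood(action, theta):
    mu, logstd = split_theta(theta)
    std = np.exp(logstd)
    return (-0.5 * ((action - mu) / std) ** 2 - logstd - 0.5 * np.log(2 * np.pi)).sum(axis=-1)

def gaussian_kl(theta_p, theta_q):
    # KL(p || q) for diagonal Gaussians, summed over action dimensions.
    mu_p, logstd_p = split_theta(theta_p)
    mu_q, logstd_q = split_theta(theta_q)
    var_p, var_q = np.exp(2 * logstd_p), np.exp(2 * logstd_q)
    return (logstd_q - logstd_p + (var_p + (mu_p - mu_q) ** 2) / (2 * var_q) - 0.5).sum(axis=-1)

theta_old = np.array([[0.0, 0.0, -1.0, -1.0]])   # mu = (0, 0), logstd = (-1, -1)
theta_new = np.array([[0.1, -0.1, -1.0, -1.0]])
action = np.array([[0.2, 0.0]])
advantage = np.array([1.5])

ratio = np.exp(gaussian_log_likelihood(action, theta_new) - gaussian_log_likelihood(action, theta_old))
policy_loss = -(ratio * advantage).mean()
print(policy_loss, gaussian_kl(theta_old, theta_new).mean())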