def make_rpredictor_network(env):
    is_train = env.phase is env.Phase.TRAIN

    with env.create_network() as net:
        h, w, c = get_input_shape()
        # Hack(MJY): forced RGB input (instead of a combination of history frames).
        c = 3

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                t1_state = O.placeholder('t1_state', shape=(None, h, w, c))
                t2_state = O.placeholder('t2_state', shape=(None, h, w, c))
                return [state, t1_state, t2_state]

            @O.auto_reuse
            def forward_conv(x):
                _ = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv0', _, 32, 5)
                    _ = O.max_pooling2d('pool0', _, 2)
                    _ = O.conv2d('conv1', _, 32, 5)
                    _ = O.max_pooling2d('pool1', _, 2)
                    _ = O.conv2d('conv2', _, 64, 4)
                    _ = O.max_pooling2d('pool2', _, 2)
                    _ = O.conv2d('conv3', _, 64, 3)
                return _

            def forward(x, t1, t2):
                dpc.add_output(forward_conv(x), name='feature')
                dpc.add_output(forward_conv(t1), name='t1_feature')
                dpc.add_output(forward_conv(t2), name='t2_feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        @O.auto_reuse
        def forward_fc(feature, action):
            action = O.one_hot(action, get_player_nr_actions())
            _ = O.concat([feature.flatten2(), action], axis=1)
            _ = O.fc('fc0', _, 512, nonlin=O.p_relu)
            reward = O.fc('fc_reward', _, 1)
            return reward

        action = O.placeholder('action', shape=(None, ), dtype='int64')
        net.add_output(forward_fc(dpc.outputs['feature'], action), name='reward')

        if is_train:
            t1_action = O.placeholder('t1_action', shape=(None, ), dtype='int64')
            t1_reward_exp = O.exp(forward_fc(dpc.outputs['t1_feature'], t1_action).sum())
            t2_action = O.placeholder('t2_action', shape=(None, ), dtype='int64')
            t2_reward_exp = O.exp(forward_fc(dpc.outputs['t2_feature'], t2_action).sum())

            pref = O.placeholder('pref')
            pref = O.callback_injector(pref)
            p1, p2 = 1 - pref, pref

            # Probability that segment t1 is preferred over t2, from the exponentiated summed
            # predicted rewards; the loss is the cross entropy against the (possibly soft)
            # preference label `pref`.
            p_greater = t1_reward_exp / (t1_reward_exp + t2_reward_exp)
            loss = -p1 * O.log(p_greater) - p2 * O.log(1 - p_greater)
            net.set_loss(loss)
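
# A minimal numpy-only sketch of the Bradley-Terry-style preference loss used above, assuming
# per-segment rewards are summed before exponentiation. The helper below is hypothetical and for
# illustration only; it is not part of the network definition.
def _demo_preference_loss(segment1_rewards, segment2_rewards, pref):
    """Return the cross-entropy preference loss for a soft label `pref`, where `pref` is the
    probability that segment 2 is preferred (pref = 0.5 means no preference)."""
    import numpy as np
    r1 = np.exp(np.sum(segment1_rewards))
    r2 = np.exp(np.sum(segment2_rewards))
    p_greater = r1 / (r1 + r2)            # P(segment 1 preferred over segment 2)
    p1, p2 = 1.0 - pref, pref
    return -p1 * np.log(p_greater) - p2 * np.log(1.0 - p_greater)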
def make_network(env):
    is_train = env.phase is env.Phase.TRAIN
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                return [state]

            def forward(x):
                _ = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv0', _, 32, 5)
                    _ = O.max_pooling2d('pool0', _, 2)
                    _ = O.conv2d('conv1', _, 32, 5)
                    _ = O.max_pooling2d('pool1', _, 2)
                    _ = O.conv2d('conv2', _, 64, 4)
                    _ = O.max_pooling2d('pool2', _, 2)
                    _ = O.conv2d('conv3', _, 64, 3)
                dpc.add_output(_, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature']
        _ = O.fc('fc0', _, 512, nonlin=O.p_relu)

        policy = O.fc('fc_policy', _, get_player_nr_actions())
        value = O.fc('fc_value', _, 1)

        expf = O.scalar('explore_factor', 1, trainable=False)
        policy_explore = O.softmax(policy * expf, name='policy_explore')
        policy = O.softmax(policy, name='policy')
        value = value.remove_axis(1, name='value')

        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))

            log_policy = O.log(policy + 1e-6)
            log_pi_a_given_s = (log_policy * O.one_hot(action, get_player_nr_actions())).sum(axis=1)
            advantage = (future_reward - O.zero_grad(value)).rename('advantage')
            policy_cost = (log_pi_a_given_s * advantage).mean(name='policy_cost')
            xentropy_cost = (-policy * log_policy).sum(axis=1).mean(name='xentropy_cost')
            value_loss = O.raw_l2_loss('raw_value_loss', future_reward, value).mean(name='value_loss')
            entropy_beta = O.scalar('entropy_beta', 0.01, trainable=False)
            loss = O.add_n([-policy_cost, -xentropy_cost * entropy_beta, value_loss], name='loss')

            net.set_loss(loss)

            for v in [policy_cost, xentropy_cost, value_loss,
                      value.mean(name='predict_value'),
                      advantage.rms(name='rms_advantage'), loss]:
                summary.scalar(v)

    if is_train:
        env.set_slave_devices(slave_devices)
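
# A minimal numpy-only sketch of how `explore_factor` above behaves: it acts as an inverse
# temperature on the policy logits. The helper below is hypothetical and for illustration only.
def _demo_explore_policy(logits, explore_factor=1.0):
    """Return softmax(logits * explore_factor); factors above 1 sharpen the policy,
    factors below 1 flatten it and encourage exploration."""
    import numpy as np
    z = np.asarray(logits, dtype=np.float64) * explore_factor
    z -= z.max()                      # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()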
def make_network(env):
    is_train = env.phase is env.Phase.TRAIN
    # Device control: always use the master device only for the training session.
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        input_length, = get_input_shape()
        action_length, = get_action_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, input_length))
                return [state]

            # Forward the policy network and the value network separately (actor-critic).
            def forward(x):
                _ = x
                _ = O.fc('fcp1', _, 512, nonlin=O.relu)
                _ = O.fc('fcp2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_p')

                _ = x
                _ = O.fc('fcv1', _, 512, nonlin=O.relu)
                _ = O.fc('fcv2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_v')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature_p']

        # Mu and std, assuming a spherical covariance.
        policy_mu = O.fc('fc_policy_mu', _, action_length)
        # In this example, we do not use the variance; instead, we use a fixed value.
        # policy_var = O.fc('fc_policy_var', _, 1, nonlin=O.softplus)
        # policy_var = O.tile(policy_var, [1, action_length], name='policy_var')
        # policy_std = O.sqrt(policy_var, name='policy_std')

        actor_space = get_env('a3c.actor_space')
        nr_bins = actor_space.shape[1]

        # Instead of a normal distribution, we use a Laplacian distribution for the policy.
        # We also sample from a truncated Laplacian distribution (we only care about values
        # inside the action space). To simplify the computation, we discretize the action space.
        actor_space = O.constant(actor_space)
        actor_space = O.tile(actor_space.add_axis(0), [policy_mu.shape[0], 1, 1])
        policy_mu3 = O.tile(policy_mu.add_axis(2), [1, 1, nr_bins])
        # policy_std3 = O.tile(policy_std.add_axis(2), [1, 1, nr_bins])
        # logits = O.abs(actor_space - policy_mu3) / (policy_std3 + 1e-2)

        # Here, we force the std of the policy to be 1.
        logits_explore = -O.abs(actor_space - policy_mu3)
        policy_explore = O.softmax(logits_explore)

        # Clip the policy for output.
        action_range = get_action_range()
        action_range = tuple(map(O.constant, action_range))
        action_range = tuple(map(lambda x: O.tile(x.add_axis(0), [policy_mu.shape[0], 1]), action_range))
        policy_output = O.clip_by_value(policy_mu, *action_range)

        _ = dpc.outputs['feature_v']
        value = O.fc('fc_value', _, 1)
        value = value.remove_axis(1, name='value')

        # Note that policy_explore is a discrete policy, while policy is the continuous one.
        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy_output, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, action_length), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))
            entropy_beta = O.scalar('entropy_beta', 0.1, trainable=False)

            # Since we discretized the action space, use cross entropy here.
            log_policy = O.log(policy_explore + 1e-4)
            log_pi_a_given_s = (log_policy * O.one_hot(action, nr_bins)).sum(axis=2).sum(axis=1)
            advantage = (future_reward - O.zero_grad(value)).rename('advantage')
            # Important trick: use only positive advantages when performing the gradient ascent;
            # this stabilizes the training.
            advantage = advantage * O.zero_grad((advantage > 0.).astype('float32'))
            policy_loss = O.identity(-(log_pi_a_given_s * advantage).mean(), name='policy_loss')
            # As mentioned, there is no trainable variance.
            # entropy_loss = O.identity(-entropy_beta * (policy_std ** 2.).sum(axis=1).mean(), name='entropy_loss')
            value_loss = O.raw_smooth_l1_loss('raw_value_loss', future_reward, value).mean(name='value_loss')
            loss = O.add_n([policy_loss, value_loss], name='loss')

            net.set_loss(loss)

            for v in [policy_loss, value_loss,
                      value.mean(name='predict_value'),
                      advantage.rms(name='rms_advantage'), loss]:
                summary.scalar(v)

    if is_train:
        env.set_slave_devices(slave_devices)
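
# A minimal numpy-only sketch of the discretized Laplacian policy above for a single action
# dimension: each bin of the action space is scored by its distance to the predicted mean,
# with the std fixed to 1. The helper below is hypothetical and for illustration only.
def _demo_discrete_laplacian_policy(mu, bins):
    """Return softmax(-|bins - mu|): a categorical distribution over the discretized action
    space whose mass decays exponentially (Laplacian-like) with distance from mu."""
    import numpy as np
    logits = -np.abs(np.asarray(bins, dtype=np.float64) - mu)
    logits -= logits.max()            # subtract the max for numerical stability
    e = np.exp(logits)
    return e / e.sum()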