Example #1
0
def make_network(env):
    """Build an A3C-style actor-critic network for image observations.

    Pipeline: conv feature extractor -> shared FC layer -> a policy head
    (plus a temperature-scaled exploration variant) and a value head.
    In the TRAIN phase it also defines the A3C loss (policy gradient,
    entropy bonus, value regression) and registers scalar summaries.

    NOTE(review): `env` is assumed to be the framework's environment
    object (provides phase, device lists, and network/dpc factories) —
    confirm against the caller.
    """
    is_train = env.phase is env.Phase.TRAIN
    if is_train:
        # Build the training graph on the master device only; the slave
        # device list is restored at the end of this function.
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                # Raw image observation; `forward` divides by 255, so a
                # uint8-range pixel input is presumably expected.
                state = O.placeholder('state', shape=(None, h, w, c))
                return [state]

            def forward(x):
                # Normalize pixels to [0, 1], then a 4-conv / 3-pool stack.
                _ = x / 255.0
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv0', _, 32, 5)
                    _ = O.max_pooling2d('pool0', _, 2)
                    _ = O.conv2d('conv1', _, 32, 5)
                    _ = O.max_pooling2d('pool1', _, 2)
                    _ = O.conv2d('conv2', _, 64, 4)
                    _ = O.max_pooling2d('pool2', _, 2)
                    _ = O.conv2d('conv3', _, 64, 3)

                dpc.add_output(_, name='feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature']
        _ = O.fc('fc0', _, 512, nonlin=O.p_relu)
        policy = O.fc('fc_policy', _, get_player_nr_actions())
        value = O.fc('fc_value', _, 1)

        # Exploration temperature: softmax(logits * explore_factor) yields a
        # sharper or flatter sampling distribution without changing `policy`.
        expf = O.scalar('explore_factor', 1, trainable=False)
        policy_explore = O.softmax(policy * expf, name='policy_explore')

        policy = O.softmax(policy, name='policy')
        value = value.remove_axis(1, name='value')

        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))

            # log pi(a|s); the 1e-6 guards against log(0).
            log_policy = O.log(policy + 1e-6)
            log_pi_a_given_s = (
                log_policy *
                O.one_hot(action, get_player_nr_actions())).sum(axis=1)
            # Stop-gradient on the value so the policy term does not
            # backpropagate into the critic.
            advantage = (future_reward -
                         O.zero_grad(value)).rename('advantage')
            policy_cost = (log_pi_a_given_s *
                           advantage).mean(name='policy_cost')
            xentropy_cost = (-policy *
                             log_policy).sum(axis=1).mean(name='xentropy_cost')
            value_loss = O.raw_l2_loss('raw_value_loss', future_reward,
                                       value).mean(name='value_loss')
            entropy_beta = O.scalar('entropy_beta', 0.01, trainable=False)
            # Maximize policy_cost and the entropy bonus, minimize the
            # value loss: hence the negated terms.
            loss = O.add_n(
                [-policy_cost, -xentropy_cost * entropy_beta, value_loss],
                name='loss')

            net.set_loss(loss)

            for v in [
                    policy_cost, xentropy_cost, value_loss,
                    value.mean(name='predict_value'),
                    advantage.rms(name='rms_advantage'), loss
            ]:
                summary.scalar(v)

    if is_train:
        # Restore the slave devices disabled at the top of the function.
        env.set_slave_devices(slave_devices)
def make_network(env):
    """Build a DQN network that evaluates both state and next_state.

    The conv trunk `phi` and the FC head `phi_fc` are wrapped in
    `O.auto_reuse`, so the same weights score the current state and the
    next state.  In the TRAIN phase it defines the 1-step TD target and
    a smooth-L1 Q loss, plus scalar summaries.
    """
    is_train = env.phase is env.Phase.TRAIN

    with env.create_network() as net:
        h, w, c = get_input_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():

            def inputs():
                state = O.placeholder('state', shape=(None, h, w, c))
                next_state = O.placeholder('next_state', shape=(None, h, w, c))
                return [state, next_state]

            # Shared conv trunk; auto_reuse ties weights across both calls.
            @O.auto_reuse
            def phi(x):
                _ = x / 255.0

                # Nature structure
                with O.argscope(O.conv2d, nonlin=O.relu):
                    _ = O.conv2d('conv1', _, 32, 8, stride=4)
                    _ = O.conv2d('conv2', _, 64, 4, stride=2)
                    _ = O.conv2d('conv3', _, 64, 3, stride=1)
                return _

            def forward(state, next_state):
                dpc.add_output(phi(state), name='feature')
                dpc.add_output(phi(next_state), name='next_feature')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        # Shared FC head returning per-action Q values plus max/argmax.
        @O.auto_reuse
        def phi_fc(feature):
            _ = feature
            _ = O.fc('fc0',
                     _,
                     512,
                     nonlin=functools.partial(O.leaky_relu, alpha=0.01))
            q_pred = O.fc('fcq', _, get_player_nr_actions())
            q_max = q_pred.max(axis=1)
            q_argmax = q_pred.argmax(axis=1)
            return q_pred, q_max, q_argmax

        _ = dpc.outputs['feature']
        q_pred, q_max, q_argmax = phi_fc(_)

        _ = dpc.outputs['next_feature']
        next_q_pred, next_q_max, _ = phi_fc(_)

        net.add_output(q_pred, name='q_pred')
        net.add_output(q_max, name='q_max')
        net.add_output(q_argmax, name='q_argmax')

        if is_train:
            reward = O.placeholder('reward', shape=(None, ), dtype='float32')
            action = O.placeholder('action', shape=(None, ), dtype='int64')
            is_over = O.placeholder('is_over', shape=(None, ), dtype='bool')

            # This TD target is only valid for 1-step returns.
            assert get_env('dqn.nr_td_steps') == 1
            # Q(s, a) for the taken action, via one-hot masking.
            this_q_pred = (q_pred *
                           O.one_hot(action, get_player_nr_actions())).sum(
                               axis=1)
            # r + gamma * max_a' Q(s', a'); zero bootstrap on terminal
            # states, stop-gradient on the target.
            this_q_label = reward + get_env('dqn.gamma') * (
                1 - is_over.astype('float32')) * O.zero_grad(next_q_max)

            summary.scalar('this_q_pred', this_q_pred.mean())
            summary.scalar('this_q_label', this_q_label.mean())
            summary.scalar('reward', reward.mean())
            summary.scalar('is_over', is_over.astype('float32').mean())

            q_loss = O.raw_smooth_l1_loss('raw_q_loss', this_q_pred,
                                          this_q_label).mean(name='q_loss')
            net.set_loss(q_loss)
Example #3
0
 def forward_fc(feature, action):
     """Predict a scalar reward from a state feature and a discrete action.

     The action is one-hot encoded, concatenated with the flattened
     feature, passed through one PReLU FC layer, and mapped to a single
     reward output.
     """
     action_onehot = O.one_hot(action, get_player_nr_actions())
     joint = O.concat([feature.flatten2(), action_onehot], axis=1)
     hidden = O.fc('fc0', joint, 512, nonlin=O.p_relu)
     return O.fc('fc_reward', hidden, 1)
Example #4
0
def make_network(env):
    """Build a continuous-action A3C network with a discretized policy.

    The actor head predicts the mean of a Laplacian-shaped policy which
    is evaluated over a discretized action space (`a3c.actor_space`);
    the critic head predicts the state value.  In the TRAIN phase it
    defines the policy-gradient and value losses and scalar summaries.

    Fix: the loss and the summary loop referenced an undefined name
    `policy_cost`; the variable actually defined is `policy_loss`
    (a NameError at graph-build time in the TRAIN phase).
    """
    is_train = env.phase is env.Phase.TRAIN

    # device control: always use master device only for training session
    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    with env.create_network() as net:
        input_length, = get_input_shape()
        action_length, = get_action_shape()

        dpc = env.create_dpcontroller()
        with dpc.activate():
            def inputs():
                state = O.placeholder('state', shape=(None, input_length))
                return [state]

            # forward policy network and value network separately (actor-critic)
            def forward(x):
                _ = x
                _ = O.fc('fcp1', _, 512, nonlin=O.relu)
                _ = O.fc('fcp2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_p')

                _ = x
                _ = O.fc('fcv1', _, 512, nonlin=O.relu)
                _ = O.fc('fcv2', _, 256, nonlin=O.relu)
                dpc.add_output(_, name='feature_v')

            dpc.set_input_maker(inputs).set_forward_func(forward)

        _ = dpc.outputs['feature_p']
        # mu and std, assuming spherical covariance
        policy_mu = O.fc('fc_policy_mu', _, action_length)

        # In this example, we do not use variance. instead, we use fixed value.
        # policy_var = O.fc('fc_policy_var', _, 1, nonlin=O.softplus)
        # policy_var = O.tile(policy_var, [1, action_length], name='policy_var')
        # policy_std = O.sqrt(policy_var, name='policy_std')

        actor_space = get_env('a3c.actor_space')
        nr_bins = actor_space.shape[1]

        # Instead of using normal distribution, we use Laplacian distribution for policy.
        # And also, we are sampling from a truncated Laplacian distribution (only care the value in the
        # action space). To simplify the computation, we discretize the action space.
        actor_space = O.constant(actor_space)
        actor_space = O.tile(actor_space.add_axis(0), [policy_mu.shape[0], 1, 1])
        policy_mu3 = O.tile(policy_mu.add_axis(2), [1, 1, nr_bins])

        # policy_std3 = O.tile(policy_std.add_axis(2), [1, 1, nr_bins])
        # logits = O.abs(actor_space - policy_mu3) / (policy_std3 + 1e-2)

        # Here, we force the std of the policy to be 1.
        logits_explore = -O.abs(actor_space - policy_mu3)
        policy_explore = O.softmax(logits_explore)

        # Clip the policy for output
        action_range = get_action_range()
        action_range = tuple(map(O.constant, action_range))
        action_range = tuple(map(lambda x: O.tile(x.add_axis(0), [policy_mu.shape[0], 1]), action_range))
        policy_output = O.clip_by_value(policy_mu, *action_range)

        _ = dpc.outputs['feature_v']
        value = O.fc('fc_value', _, 1)
        value = value.remove_axis(1, name='value')

        # Note that, here the policy_explore is a discrete policy,
        # and policy is actually the continuous one.
        net.add_output(policy_explore, name='policy_explore')
        net.add_output(policy_output, name='policy')
        net.add_output(value, name='value')

        if is_train:
            action = O.placeholder('action', shape=(None, action_length), dtype='int64')
            future_reward = O.placeholder('future_reward', shape=(None, ))
            # NOTE(review): entropy_beta is registered but currently unused,
            # since the entropy loss below is commented out.
            entropy_beta = O.scalar('entropy_beta', 0.1, trainable=False)

            # Since we discretized the action space, use cross entropy here.
            log_policy = O.log(policy_explore + 1e-4)
            log_pi_a_given_s = (log_policy * O.one_hot(action, nr_bins)).sum(axis=2).sum(axis=1)
            # Stop-gradient on the value so the policy term does not
            # backpropagate into the critic.
            advantage = (future_reward - O.zero_grad(value)).rename('advantage')

            # Important trick: using only positive advantage to perform gradient assent. This stabilizes the training.
            advantage = advantage * O.zero_grad((advantage > 0.).astype('float32'))
            policy_loss = O.identity(-(log_pi_a_given_s * advantage).mean(), name='policy_loss')

            # As mentioned, there is no trainable variance.
            # entropy_loss = O.identity(-entropy_beta * (policy_std ** 2.).sum(axis=1).mean(), name='entropy_loss')

            value_loss = O.raw_smooth_l1_loss('raw_value_loss', future_reward, value).mean(name='value_loss')

            # BUGFIX: was `policy_cost`, an undefined name in this function.
            loss = O.add_n([policy_loss, value_loss], name='loss')

            net.set_loss(loss)

            for v in [policy_loss, value_loss,
                      value.mean(name='predict_value'), advantage.rms(name='rms_advantage'), loss]:
                summary.scalar(v)

    if is_train:
        env.set_slave_devices(slave_devices)