Example #1
File: op.py  Project: yongyi-wu/gluon-nlp
def gumbel_softmax(logits, temperature: float = 1.0, eps: float = 1E-10,
                   hard=True, use_np_gumbel: bool = True):
    r"""Perform the gumbel-softmax trick to generate differentiable one-hot vectors from the input
    logits.

    Here, the gumbel distribution is

    Gumbel(\alpha) = -\log(-\log U) + \log \alpha, in which U is the Uniform(0, 1) distribution.

    A nice property of Gumbel is:

    \argmax({Gumbel(\alpha_i)}) \sim multinomial(\alpha_i)

    The Gumbel-Softmax trick is to use the softmax + straight-through estimator to produce
    one-hot vectors that represent the sampling result.

    References:

        1. https://en.wikipedia.org/wiki/Gumbel_distribution
        2. [ICLR2017] Categorical Reparameterization with Gumbel-Softmax

    Parameters
    ----------
    logits
        Logits. Shape (..., V)
    temperature
        The temperature that controls the sharpness of the softmax output. Lower values
        make the output closer to one-hot.
    eps
        The eps for stability of gradient
    hard
        Whether to use the straight-through estimator to produce one-hot vectors.
    use_np_gumbel
        Whether to use the random.gumbel operator

    Returns
    -------
    ret
        The returned output. Shape (..., V)
    """
    # TODO(sxjscience) Investigate the impact of random.gumbel:
    #  Actually, random.gumbel has no eps and may have problems when calculating the gradient.
    if use_np_gumbel:
        gumbels = np.random.gumbel(np.zeros_like(logits))
    else:
        u = np.random.uniform(np.zeros_like(logits), 1)
        gumbels = -np.log(-np.log(u + eps) + eps)
    y = npx.softmax((gumbels + logits) / temperature, axis=-1)
    if hard:
        y_hard = np.max(y, axis=-1, keepdims=True) == y
        y_hard = npx.stop_gradient(y_hard - y) + y
        return y_hard
    else:
        return y
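A minimal usage sketch of gumbel_softmax above, assuming the MXNet numpy interface is active (npx.set_np()); the logits are made up for illustration:

logits = np.array([[1.0, 2.0, 0.5]])        # made-up logits, shape (1, 3)
one_hot = gumbel_softmax(logits, temperature=0.5, hard=True)
# With hard=True each row is exactly one-hot, yet gradients flow through the soft sample.
print(one_hot, one_hot.sum(axis=-1))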
Example #2
def trunc_gumbel(logits, truncation):
    """Sample from the TruncGumbel distribution.

    The cumulative distribution function (CDF) of the truncated Gumbel distribution is defined as

    TruncGumbel(\alpha, truncation) \propto max(Gumbel(\alpha), truncation)

    To sample from the distribution, we can use the CDF inversion technique.

    References:

        1. [NIPS2014] A* Sampling, https://papers.nips.cc/paper/5449-a-sampling.pdf
        2. https://cmaddis.github.io/gumbel-machinery

    Parameters
    ----------
    logits
        The logits. Shape (...,)
    truncation
        The truncation. Shape (...,)

    Returns
    -------
    samples
        Samples from the TruncGumbel(logits, truncation)
        Shape (...,)
    """
    gumbels = np.random.gumbel(np.zeros_like(logits)) + logits
    return -np.log(np.exp(-gumbels) + np.exp(-truncation))
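A quick sanity check of the truncation property, sketched with plain NumPy as a stand-in for the MXNet array API; the truncation value is made up:

import numpy as np

logits = np.zeros(100000)
truncation = np.full_like(logits, 0.5)
gumbels = np.random.gumbel(np.zeros_like(logits)) + logits
samples = -np.log(np.exp(-gumbels) + np.exp(-truncation))
# Every sample lies below the truncation point, as expected from the CDF inversion.
assert (samples <= truncation).all()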
Example #3
def relative_position_bucket(relative_position,
                             bidirectional: bool = True,
                             num_buckets: int = 32,
                             max_distance: int = 128):
    """Map the relative position to buckets. The implementation is consistent with that
    in [mesh_tensorflow](https://github.com/tensorflow/mesh/blob/c59988047e49b4d2af05603e3170724cdbadc467/mesh_tensorflow/transformer/transformer_layers.py#L595-L637)
    where relative position is defined as `mem_i - query_j`. Thus, a positive value indicates 
    that the memory slot is in a later timestamp than the query slot. 

    After handling the bidirectional case (see below), the implementation uses the first half
    of the buckets to store exact differences and the second half to store the differences after
    a logarithmic transformation.

    Parameters
    ----------
    relative_position
        Shape (...,)
    bidirectional
        Whether we are dealing with bidirectional attention.
        If it's bidirectional, positive shifts are mapped to [0, num_buckets // 2),
        and negative shifts are mapped to [num_buckets // 2, num_buckets).
    num_buckets
        The number of buckets.
    max_distance
        Maximum distance. Positions that fall outside of 'max_distance' will be trimmed.

    Returns
    -------
    buckets
        Shape (...,).
        It has the same shape as the `relative_position`. It will have int32 type.
    """
    ret = 0
    relative_position = -relative_position
    if bidirectional:
        assert num_buckets % 2 == 0, 'When bidirectional is True, the number of buckets must be ' \
                                     'divisible by 2.'
        num_buckets //= 2
        ret = ret + (relative_position < 0).astype(np.int32) * num_buckets
        relative_position = np.abs(relative_position)
    else:
        # Clip all the negative values to 0
        relative_position = np.clip(relative_position, a_min=0, a_max=None)
    # Now, the relative_position is in the range [0, inf)

    # Half of the buckets deal with the exact increments,
    # i.e., 0, 1, 2, ..., max_exact - 1, where max_exact = num_buckets // 2
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # The other half of the buckets are for logarithmically bigger bins in positions up to
    # max_distance
    val_if_large = max_exact + (
        np.log(relative_position.astype(np.float32) / max_exact) /
        math.log(max_distance / max_exact) *
        (num_buckets - max_exact)).astype(np.int32)
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    ret = ret + np.where(is_small, relative_position, val_if_large)
    return ret
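A small illustration of the bucketing behaviour, assuming math and the MXNet numpy interface are imported as in the source file; the relative positions are made up:

rel_pos = np.array([-8, -1, 0, 1, 2, 50, 200])   # mem_i - query_j, made-up shifts
buckets = relative_position_bucket(rel_pos, bidirectional=True,
                                   num_buckets=32, max_distance=128)
# Small shifts land in the exact buckets, large shifts in the logarithmic buckets,
# and the sign of the shift selects which half of the bucket range is used.
print(buckets)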
Example #4
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    c_anc = d2l.box_corner_to_center(anchors)
    c_assigned_bb = d2l.box_corner_to_center(assigned_bb)
    offset_xy = 10 * (c_assigned_bb[:, :2] -
                      c_anc[:, :2]) / c_anc[:, 2:]  # standard deviation = 0.1
    offset_wh = 5 * np.log(
        eps + c_assigned_bb[:, 2:] / c_anc[:, 2:])  # standard deviation = 0.2
    offset = np.concatenate([offset_xy, offset_wh], axis=1)
    return offset
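A minimal usage sketch of offset_boxes, assuming the d2l package provides box_corner_to_center as used above; the corner coordinates (xmin, ymin, xmax, ymax) are made up:

anchors = np.array([[0.1, 0.1, 0.3, 0.3]])
assigned_bb = np.array([[0.1, 0.1, 0.3, 0.3]])   # ground truth identical to the anchor
# When the assigned box coincides with the anchor, all four offsets are close to 0:
# the xy term vanishes and the wh term is 5 * log(1 + eps) ~= 0.
print(offset_boxes(anchors, assigned_bb))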
Example #5
def test_gammaln():
    A = np.ones((2, INT_OVERFLOW))
    A[0][0] = 5
    A.attach_grad()
    with mx.autograd.record():
        B = npx.gammaln(A)
    assert B.shape == (2, INT_OVERFLOW)
    assert_almost_equal(B[0][0], np.array([np.log(24)]), \
                rtol=1e-3, atol=1e-5)
    B.backward()
    assert A.grad.shape == (2, INT_OVERFLOW)
    assert_almost_equal(A.grad[0][0], np.array([1.5061178]), \
                rtol=1e-3, atol=1e-5)
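The expected constants follow from gammaln(5) = log Γ(5) = log 4! and from the fact that the derivative of gammaln is the digamma function; a quick cross-check, assuming SciPy is available:

import numpy as np
from scipy.special import gammaln, digamma

assert np.isclose(gammaln(5.0), np.log(24))              # log Γ(5) = log 24
assert np.isclose(digamma(5.0), 1.5061178, atol=1e-6)    # d/dx log Γ(x) at x = 5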
Example #6
    def forward(self, length_predictions, labels):
        """
        Returns the Poisson loss and the number of samples, given length predictions and expected integers as labels.

        :param length_predictions: Length predictions. Shape: (batch_size,).
        :param labels: Targets. Shape: (batch_size,).
        :return: Poisson loss of length predictions of the batch, and number of samples (batch size).
        """
        # (batch_size,)
        loss = length_predictions - labels * np.log(np.maximum(1e-10, length_predictions))
        # (1,)
        loss = np.sum(loss * self.weight)
        num_samples = np.sum(np.ones_like(length_predictions))
        return loss, num_samples
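The per-sample term above is the Poisson negative log-likelihood lambda - k * log(lambda) with the prediction-independent log(k!) constant dropped; a minimal sketch in plain NumPy with made-up values and unit weight:

import numpy as np

preds = np.array([2.0, 5.0])     # predicted rates (lambda), made up
labels = np.array([3.0, 4.0])    # observed counts (k), made up
loss = preds - labels * np.log(np.maximum(1e-10, preds))
# Equivalent to the Poisson NLL up to log(k!), which does not depend on the
# predictions and therefore does not affect the gradients.
print(loss.sum(), preds.size)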
Example #7
def test_power():
    A = np.full((2, INT_OVERFLOW), 2)
    B = np.ones((2, INT_OVERFLOW))
    B[-1, -1] = 3
    A.attach_grad()
    B.attach_grad()
    with mx.autograd.record():
        C = np.power(A, B)
        C.backward()
    assert C.shape == A.shape
    assert C[-1, -1] == 8
    assert A.grad.shape == A.shape
    assert A.grad[-1, -1] == 12
    assert B.grad.shape == B.shape
    assert_almost_equal(B.grad[-1, -1], 2**3 * np.log(2), rtol=1e-5, atol=1e-5)
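The hard-coded expectations follow from the power-rule gradients; a quick arithmetic check in plain NumPy:

import numpy as np

a, b = 2.0, 3.0
assert a ** b == 8                                           # forward value
assert b * a ** (b - 1) == 12                                # d(a**b)/da = b * a**(b - 1)
assert np.isclose(a ** b * np.log(a), 2 ** 3 * np.log(2))    # d(a**b)/db = a**b * ln(a)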
Example #8
def test_ldexp():
    A = np.ones((2, INT_OVERFLOW))
    B = np.ones((2, INT_OVERFLOW))
    A[-1, -1], B[-1, -1] = 5, 2
    A.attach_grad()
    B.attach_grad()
    with mx.autograd.record():
        C = np.ldexp(A, B)
        C.backward()
    assert C.shape == A.shape
    assert C[-1, -1] == 20
    assert A.grad.shape == A.shape
    assert A.grad[-1, -1] == 4
    assert B.grad.shape == B.shape
    assert_almost_equal(B.grad[-1, -1], A[-1, -1] * 2**B[-1, -1] * np.log(2), \
        rtol=1e-5, atol=1e-5)
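Likewise for ldexp(a, b) = a * 2**b; the expected value and gradients check out in plain NumPy:

import numpy as np

a, b = 5.0, 2.0
assert np.ldexp(a, int(b)) == 20                             # a * 2**b
assert 2 ** b == 4                                           # d(a * 2**b)/da = 2**b
assert np.isclose(a * 2 ** b * np.log(2), 20 * np.log(2))    # d(a * 2**b)/db = a * 2**b * ln(2)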
Example #9
def linear_interpolation(predictions):
    return -np.log(utils.average_arrays(predictions))  # pylint: disable=invalid-unary-operand-type
Example #10
def _init_sinusoidal_base(units):
    half_units = units // 2
    val = np.log(10000) / (half_units - 1)
    val = np.exp(np.arange(half_units, dtype=np.float32) * -val)
    return val
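The returned vector holds the inverse frequencies of sinusoidal position embeddings; a minimal sketch in plain NumPy of how such a base is typically combined with positions (sequence length and units are made up):

import numpy as np

units, seq_len = 8, 10
half_units = units // 2
base = np.exp(np.arange(half_units, dtype=np.float32) * -(np.log(10000) / (half_units - 1)))
positions = np.arange(seq_len, dtype=np.float32)[:, None]
emb = np.concatenate([np.sin(positions * base), np.cos(positions * base)], axis=-1)
print(emb.shape)   # (10, 8): one sin/cos pair per frequency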
Example #11
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken, set values
    # less than 1 to 1
    clipped_preds = np.clip(net(features), 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())
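Clipping the predictions to at least 1 keeps the logarithm non-negative and avoids log of zero or negative outputs; a small illustration in plain NumPy with made-up predictions:

import numpy as np

preds = np.array([-0.5, 0.2, 3.0])          # raw network outputs, made up
clipped = np.clip(preds, 1, float('inf'))
print(np.log(clipped))                      # [0., 0., 1.0986...] -- no NaN or -inf from the log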
Example #12
    def dynamic_masking(self, input_ids, valid_lengths):
        # TODO(zheyuye), two additional flags `disallow_from_mask` and `already_masked`
        # that control the masking status for each position in the sequence.
        """
        Generate masking positions on-the-fly instead of during preprocessing
        Parameters
        ----------
        input_ids
            The batchified input_ids with shape (batch_size, max_seq_length)
        valid_lengths
            The batchified valid_lengths with shape (batch_size, )
        Returns
        -------
        masked_input_ids
            The masked input sequence, in which 15% of the tokens are masked with [MASK]
            shape (batch_size, max_seq_length)
        length_masks
            The masking matrix for the whole sequence that masks out the positions
            beyond valid_length
            shape (batch_size, max_seq_length)
        unmasked_tokens
            The original tokens that appear in the unmasked input sequence
            shape (batch_size, num_masked_positions)
        masked_positions
            The masking positions in mx.np.ndarray
            shape (batch_size, num_masked_positions)
        masked_lm_weights
            The weight matrix containing 0 or 1 to mark the actual effect of masked positions
            shape (batch_size, num_masked_positions)
        """
        N = self._max_num_masked_position
        # Only valid tokens that are not special tokens are allowed to be masked
        valid_candidates = np.ones_like(input_ids, dtype=np.bool)
        ignore_tokens = [
            self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id
        ]

        for ignore_token in ignore_tokens:
            # TODO(zheyuye), Update when operation += supported
            valid_candidates = valid_candidates * \
                np.not_equal(input_ids, ignore_token)
        valid_lengths = valid_lengths.astype(np.float32)
        valid_candidates = valid_candidates.astype(np.float32)
        num_masked_position = mxnp.maximum(
            1, np.minimum(N, round(valid_lengths * self._mask_prob)))

        # Get the masking probability of each position
        sample_probs = self._proposal_distribution * valid_candidates
        sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
        sample_probs = npx.stop_gradient(sample_probs)
        gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))
        # Following the official repo, avoid duplicated positions by using top-k Gumbel
        # sampling, see https://github.com/google-research/electra/issues/41
        masked_positions = npx.topk(mxnp.log(sample_probs) + gumbels,
                                    k=N,
                                    axis=-1,
                                    ret_typ='indices',
                                    dtype=np.int32)

        masked_weights = npx.sequence_mask(mxnp.ones_like(masked_positions),
                                           sequence_length=num_masked_position,
                                           use_sequence_length=True,
                                           axis=1,
                                           value=0)
        masked_positions = masked_positions * masked_weights
        length_masks = npx.sequence_mask(mxnp.ones_like(input_ids,
                                                        dtype=np.float32),
                                         sequence_length=valid_lengths,
                                         use_sequence_length=True,
                                         axis=1,
                                         value=0)
        unmasked_tokens = select_vectors_by_position(
            input_ids, masked_positions) * masked_weights
        masked_weights = masked_weights.astype(np.float32)
        replaced_positions = (mxnp.random.uniform(
            mxnp.zeros_like(masked_positions), mxnp.ones_like(
                masked_positions)) < self._replace_prob) * masked_positions
        # Deal with multiple zero values in replaced_positions, which would otherwise
        # cause the [CLS] token to be replaced
        filled = mxnp.where(replaced_positions, self.vocab.mask_id,
                            self.vocab.cls_id).astype(np.int32)
        # Masking token by replacing with [MASK]
        masked_input_ids = update_vectors_by_position(input_ids, filled,
                                                      replaced_positions)

        # Note: masked_positions is likely to contain multiple zero values if the number of
        # masked positions has not reached the maximum. However, this case hardly occurs since
        # valid_length is almost always equal to max_seq_length
        masked_input = self.MaskedInput(input_ids=masked_input_ids,
                                        masks=length_masks,
                                        unmasked_tokens=unmasked_tokens,
                                        masked_positions=masked_positions,
                                        masked_weights=masked_weights)
        return masked_input
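The position sampling above relies on the Gumbel-top-k trick: adding Gumbel noise to the log-probabilities and taking the top k indices draws k distinct positions in proportion to the sampling probabilities. A minimal sketch in plain NumPy with made-up probabilities:

import numpy as np

probs = np.array([0.05, 0.4, 0.1, 0.25, 0.2])          # made-up sampling probabilities
k = 2
gumbels = np.random.gumbel(size=probs.shape)
topk = np.argsort(np.log(probs) + gumbels)[::-1][:k]   # k distinct indices, no duplicates
print(topk)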
Example #13
File: 3-6.py  Project: maniacalmm/d2l_ex
def cross_entropy(y_hat, y):
    # Picking entries this way is an implicit multiplication by a one-hot mask:
    # conceptually, the label selects the single entry of y_hat that corresponds
    # to the true class, and every other entry contributes 0.
    return - np.log(y_hat[range(len(y_hat)), y])
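The equivalence between the fancy indexing and an explicit one-hot multiplication can be checked in plain NumPy with made-up values:

import numpy as np

y_hat = np.array([[0.1, 0.7, 0.2], [0.3, 0.3, 0.4]])   # predicted probabilities, made up
y = np.array([1, 2])                                   # integer class labels
picked = -np.log(y_hat[range(len(y_hat)), y])
one_hot = np.eye(y_hat.shape[1])[y]
assert np.allclose(picked, -(one_hot * np.log(y_hat)).sum(axis=1))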
Example #14
def loss(y_hat, y):
    m = y.shape[0]
    p = softmax(y_hat)
    return np.sum(-np.log(p[range(m), y]))
Example #15
from concurrent.futures import ProcessPoolExecutor, as_completed

import d2l
import tqdm
from mxnet import autograd, gluon, init, np, npx
from mxnet.gluon import nn
from mxnet.gluon.utils import split_and_load

import logger

npx.set_np()

log = logger.get_logger('BERT')

INVALID_THD = 10  # Invalid throughput threshold ratio.
INVALID_LOG_THD = np.log(INVALID_THD)

### Class Declarations


class BERTEncoder(nn.Block):
    """BERT Encoder class."""
    def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, num_layers,
                 dropout, **kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.blks = nn.Sequential()
        for _ in range(num_layers):
            self.blks.add(
                d2l.EncoderBlock(num_hiddens, ffn_num_hiddens, num_heads,
                                 dropout))
Example #16
    def forward(self, positive, negative):
        distances = positive - negative
        loss = -np.sum(np.log(npx.sigmoid(distances)), 0, keepdims=True)
        return loss
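This is the pairwise ranking (BPR-style) loss -sum(log(sigmoid(positive - negative))); a minimal sketch in plain NumPy with made-up scores:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

positive = np.array([2.0, 0.5])    # scores of observed items, made up
negative = np.array([1.0, 1.5])    # scores of sampled negatives, made up
loss = -np.sum(np.log(sigmoid(positive - negative)), 0, keepdims=True)
print(loss)   # small when positives outscore negatives, large otherwise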
Example #17
def cross_entropy(y_hat, y):
    return -np.log(y_hat[range(len(y_hat)), y])
Example #18
def log_linear_interpolation(predictions):
    log_probs = utils.average_arrays([np.log(p) for p in predictions])
    return -npx.log_softmax(log_probs)  # pylint: disable=invalid-unary-operand-type
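In contrast to Example #9, this ensemble averages log-probabilities (a geometric mean over the models) and then renormalizes with log_softmax; a plain-NumPy sketch of the averaging step with made-up predictions, renormalization omitted:

import numpy as np

predictions = [np.array([0.2, 0.5, 0.3]), np.array([0.1, 0.6, 0.3])]   # made-up model outputs
linear = -np.log(np.mean(predictions, axis=0))                  # Example #9: average the probabilities
log_linear = np.mean([np.log(p) for p in predictions], axis=0)  # this example: average the log-probabilities
print(linear, -log_linear)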
Example #19
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken,
    # set values less than 1 to 1
    net_out = net(features)
    clipped_preds = np.clip(net_out, 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())