Example No. 1
def get_rmse_log(net, X_train, y_train):
    """Gets the root MSE between the logarithms of the predictions and the truth."""
    num_train = X_train.shape[0]
    # Clip predictions to [1, inf) so that taking the logarithm is numerically safe.
    clipped_preds = np.clip(net(X_train), 1, float('inf'))
    return np.sqrt(
        2 *
        np.sum(square_loss(np.log(clipped_preds), np.log(y_train))).item() /
        num_train)
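A minimal call sketch (an assumption, not part of the original example): it takes square_loss to be gluon.loss.L2Loss(), so the factor of 2 above cancels the 1/2 inside that loss, and uses a placeholder one-layer network with random data.

from mxnet import np, npx, gluon
npx.set_np()

square_loss = gluon.loss.L2Loss()   # assumed definition of the external `square_loss`
net = gluon.nn.Dense(1)             # placeholder regressor
net.initialize()
X_train = np.random.uniform(1, 10, size=(8, 4))
y_train = np.random.uniform(1, 10, size=(8, 1))
print(get_rmse_log(net, X_train, y_train))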
Example No. 2
def multibox_prior(data, sizes, ratios):
    """Generate anchor boxes with different sizes and ratios centered on each pixel."""
    # `data` shape: (batch_size, channels, height, width)
    in_height, in_width = data.shape[-2:]

    device, num_sizes, num_ratios = data.ctx, len(sizes), len(ratios)
    boxes_per_pixel = num_sizes + num_ratios - 1
    size_tensor = np.array(sizes, ctx=device)
    ratio_tensor = np.array(ratios, ctx=device)

    # Offsets are required to move the anchor to center of a pixel
    # Since pixel (height=1, width=1), we choose to offset our centers by 0.5
    offset_w, offset_h = 0.5, 0.5
    steps_h = 1.0 / in_height  # Scaled steps in y axis
    steps_w = 1.0 / in_width  # Scaled steps in x axis

    # Generate all center points for the anchor boxes
    center_h = (np.arange(in_height, ctx=device) + offset_h) * steps_h
    center_w = (np.arange(in_width, ctx=device) + offset_w) * steps_w
    shift_x, shift_y = np.meshgrid(center_w, center_h)
    shift_x, shift_y = shift_x.reshape(-1), shift_y.reshape(-1)

    # Generate `boxes_per_pixel` heights and widths that are later used to
    # create the anchor box corner coordinates (xmin, ymin, xmax, ymax):
    # concatenate (all sizes, first ratio) with (first size, remaining ratios)
    w = np.concatenate((size_tensor * np.sqrt(ratio_tensor[0]),
                        size_tensor[0] * np.sqrt(ratio_tensor[1:]))) \
        * in_height / in_width  # Handle rectangular inputs
    h = np.concatenate((size_tensor / np.sqrt(ratio_tensor[0]),
                        size_tensor[0] / np.sqrt(ratio_tensor[1:])))

    # Divide by 2 to get half height and half width
    anchor_manipulations = np.tile(
        np.stack((-w, -h, w, h)).T, (in_height * in_width, 1)) / 2

    # Each center point will have boxes_per_pixel number of anchor boxes, so
    # generate grid of all anchor box centers with boxes_per_pixel repeats
    out_grid = np.stack([shift_x, shift_y, shift_x, shift_y],
                        axis=1).repeat(boxes_per_pixel, axis=0)

    output = out_grid + anchor_manipulations
    return np.expand_dims(output, axis=0)
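A quick shape check on a dummy feature map (the sizes and ratios below are arbitrary illustration values): with 3 sizes and 3 ratios, each pixel gets 3 + 3 - 1 = 5 anchor boxes.

from mxnet import np, npx
npx.set_np()

X = np.random.uniform(size=(1, 3, 4, 4))   # (batch_size, channels, height, width)
anchors = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(anchors.shape)   # (1, 4 * 4 * 5, 4) = (1, 80, 4)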
Example No. 3
def evaluator(network, inter_matrix, test_data, ctx):
    """Compute the reconstruction RMSE of `network` over the observed entries of `test_data`."""
    scores = []
    for values in inter_matrix:
        feat = gluon.utils.split_and_load(values, ctx, even_split=False)
        scores.extend([network(i).asnumpy() for i in feat])
    recons = np.array([item for sublist in scores for item in sublist])
    # Calculate the test RMSE over observed (non-zero) entries only.
    rmse = np.sqrt(
        np.sum(np.square(test_data - np.sign(test_data) * recons)) /
        np.sum(np.sign(test_data)))
    return float(rmse)
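A minimal call sketch, using a hypothetical identity "network" so the reconstruction is exact; a real setup would pass an AutoRec-style model and iterate the interaction matrix through a DataLoader. Only the observed (non-zero) entries of test_data contribute to the RMSE.

from mxnet import np, npx, gluon
npx.set_np()

ctx = [npx.cpu()]
network = lambda x: x                                  # stand-in reconstruction model
ratings = np.array([[5.0, 0.0, 3.0],
                    [0.0, 4.0, 0.0]])                  # user-item interaction matrix
inter_matrix = [ratings]                               # a single "batch" of user rows
print(evaluator(network, inter_matrix, ratings, ctx))  # 0.0: a perfect reconstruction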
Example No. 4
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken,
    # clip predictions that are less than 1 up to 1.
    clipped_preds = np.clip(net(features), 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())
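As in Example No. 1, `loss` is assumed to be a squared-error criterion defined elsewhere (for instance gluon.loss.L2Loss()), which is why the mean is multiplied by 2 before the square root is taken.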
Example No. 5
    def __init__(self,
                 d_model,
                 d_kv,
                 d_ff,
                 is_decoder,
                 num_heads=12,
                 dropout_prob=0.1,
                 layer_norm_eps=1E-6,
                 activation='relu',
                 init_factor=1.0,
                 layout='NT',
                 dtype='float32'):
        super().__init__()
        self._d_model = d_model
        self._d_kv = d_kv
        self._d_ff = d_ff
        self._is_decoder = is_decoder
        self._num_heads = num_heads
        self._inner_dim = self._num_heads * self._d_kv
        self._dtype = dtype
        assert layout in ['TN', 'NT'], \
            'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
        self._layout = layout
        self._time_axis = 1 if self.layout == 'NT' else 0

        self.self_attn_layer_norm = RMSNorm(in_channels=d_model,
                                            center=False,
                                            scale=True,
                                            gamma_initializer=Constant(
                                                1.0 * init_factor),
                                            variance_epsilon=layer_norm_eps,
                                            dtype=dtype)
        # avoid scaling before softmax
        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
        self.self_attn_q = nn.Dense(units=self._inner_dim,
                                    in_units=d_model,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=Normal(
                                        (d_model * d_kv)**-0.5 * init_factor),
                                    dtype=dtype)
        self.self_attn_k = nn.Dense(units=self._inner_dim,
                                    in_units=d_model,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=Normal(d_model**-0.5 *
                                                              init_factor),
                                    dtype=dtype)
        self.self_attn_v = nn.Dense(units=self._inner_dim,
                                    in_units=d_model,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=Normal(d_model**-0.5 *
                                                              init_factor),
                                    dtype=dtype)
        self.self_attn = MultiHeadAttentionCell(
            query_units=self._inner_dim,
            num_heads=num_heads,
            attention_dropout=dropout_prob,
            scaled=False,
            normalized=False,
            dtype=dtype,
            layout='NTK' if layout == 'NT' else 'TNK',
            use_einsum=False)
        self.self_attn_proj = nn.Dense(
            units=d_model,
            in_units=self._inner_dim,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(self._inner_dim**-0.5 * init_factor),
            dtype=dtype)
        if is_decoder:
            self.cross_attn_layer_norm = RMSNorm(
                in_channels=d_model,
                center=False,
                scale=True,
                gamma_initializer=Constant(1.0 * init_factor),
                variance_epsilon=layer_norm_eps,
                dtype=dtype)
            # avoid scaling before softmax
            self.cross_attn_q = nn.Dense(
                units=self._inner_dim,
                in_units=d_model,
                flatten=False,
                use_bias=False,
                weight_initializer=Normal(
                    (d_model * d_kv)**-0.5 * init_factor),
                dtype=dtype)
            self.cross_attn_k = nn.Dense(units=self._inner_dim,
                                         in_units=d_model,
                                         flatten=False,
                                         use_bias=False,
                                         weight_initializer=Normal(
                                             d_model**-0.5 * init_factor),
                                         dtype=dtype)
            self.cross_attn_v = nn.Dense(units=self._inner_dim,
                                         in_units=d_model,
                                         flatten=False,
                                         use_bias=False,
                                         weight_initializer=Normal(
                                             d_model**-0.5 * init_factor),
                                         dtype=dtype)
            self.cross_attn = MultiHeadAttentionCell(
                query_units=self._inner_dim,
                num_heads=num_heads,
                attention_dropout=dropout_prob,
                scaled=False,
                normalized=False,
                dtype=dtype,
                layout='NTK' if layout == 'NT' else 'TNK',
                use_einsum=False)
            self.cross_attn_proj = nn.Dense(
                units=d_model,
                in_units=self._inner_dim,
                flatten=False,
                use_bias=False,
                weight_initializer=Normal(self._inner_dim**-0.5 * init_factor),
                dtype=dtype)
        assert activation in ['relu', 'gated-gelu'], \
            '{} is not supported. Please choose from "relu" and "gated-gelu"'.format(activation)
        # the weight_initializer here is equivalent to Normal(in_units ** -0.5 * init_factor)
        self.ffn = PositionwiseFFN(
            units=d_model,
            hidden_size=d_ff,
            use_bias=False,
            activation_dropout=dropout_prob,
            dropout=dropout_prob,
            weight_initializer=Xavier('gaussian', 'in', np.sqrt(init_factor)),
            activation='relu' if activation == 'relu' else 'gelu(tanh)',
            use_gated_activation=False if activation == 'relu' else True,
            normalization='rms_norm',
            layer_norm_eps=layer_norm_eps,
            pre_norm=True,
            dtype=dtype,
            center=False,
            scale=True,
            gamma_initializer=Constant(1.0 * init_factor))
        self.dropout = nn.Dropout(dropout_prob)
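The dense layers above follow the T5/Mesh-TensorFlow initialization scheme: because no 1/sqrt(d) scaling is applied before the softmax, the query projection absorbs that factor into its initializer standard deviation. A small arithmetic sketch of the resulting scales (the d_model, d_kv, num_heads values below are illustrative assumptions):

d_model, d_kv, num_heads, init_factor = 768, 64, 12, 1.0
inner_dim = num_heads * d_kv

q_std = (d_model * d_kv) ** -0.5 * init_factor    # self_attn_q / cross_attn_q
kv_std = d_model ** -0.5 * init_factor            # key and value projections
proj_std = inner_dim ** -0.5 * init_factor        # self_attn_proj / cross_attn_proj
print(f"q std: {q_std:.5f}, k/v std: {kv_std:.5f}, proj std: {proj_std:.5f}")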
Example No. 6
def multi_head_dot_attn(query,
                        key,
                        value,
                        mask=None,
                        edge_scores=None,
                        dropout: float = 0.0,
                        scaled: bool = True,
                        normalized: bool = False,
                        eps: float = 1E-6,
                        query_head_units: Optional[int] = None,
                        layout: str = 'NKT',
                        use_einsum: bool = False,
                        dtype=np.float32):
    """Multihead dot product attention between the query, key, value.

    scaled is False, normalized is False:
        D(h_q, h_k) = <h_q, h_k>
    scaled is True, normalized is False:
        D(h_q, h_k) = <h_q, h_k> / sqrt(dim_q)
    scaled is False, normalized is True:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||>
    scaled is True, normalized is True:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||> / sqrt(dim_q)

    If edge_scores is provided, we will calculate the attention as
        scores = D(h_q, h_k) + EdgeScore_{q, k}

    Parameters
    ----------
    query
        Query. The shape depends on the layout
        - layout is 'NKT'
            Shape (batch_size, num_heads, query_length, key_dim)
        - layout is 'NTK'
            Shape (batch_size, query_length, num_heads, key_dim)
        - layout is 'TNK'
            Shape (query_length, batch_size, num_heads, key_dim)
    key
        Key. The shape depends on the layout
        - layout is 'NKT'
            Shape (batch_size, num_heads, mem_length, key_dim)
        - layout is 'NTK'
            Shape (batch_size, mem_length, num_heads, key_dim)
        - layout is 'TNK'
            Shape (mem_length, batch_size, num_heads, key_dim)
    value
        Value. The shape depends on the layout
        - layout is 'NKT'
            Shape (batch_size, num_heads, mem_length, value_dim)
        - layout is 'NTK'
            Shape (batch_size, mem_length, num_heads, value_dim)
        - layout is 'TNK'
            Shape (mem_length, batch_size, num_heads, value_dim)
    mask
        Mask between query and memory. Shape (batch_size, query_length, mem_length)
    edge_scores
        The edge attention score. Shape can be any shape that is broadcastable to
        (batch_size, num_heads, query_length, mem_length)
    dropout
        Dropout rate
    scaled
        Whether to divide the attention scores by the square root of the query dimension.
        This was first proposed in "Attention Is All You Need" (NIPS 2017)::

            score = <h_q, h_k> / sqrt(dim_q)

    normalized
        If turned on, the cosine distance is used, i.e::

            score = <h_q / ||h_q||, h_k / ||h_k||>

    eps
        The epsilon value used in L2 normalization
    query_head_units
        The number of units in each query head. If it is None, we will estimate it via the
        shape_array of the query.
    layout
        This stands for the layout of the attention cell. The shape of the input/output will depend
        on the layout. Currently, we support 'NKT', 'NTK' and 'TNK' in which
        'N' means the batch_size, 'K' means the head, and 'T' means the length dimension.
    use_einsum
        Whether to use einsum for the computation

    Returns
    -------
    context_vec
        - layout is 'NKT' or 'NTK'
            Shape (batch_size, query_length, num_heads * value_units)
        - layout is 'TNK'
            Shape (query_length, batch_size, num_heads * value_units)
    additional_info
        scores:
            Shape (batch_size, num_head, query_length, mem_length)
        attn_weight:
            Shape (batch_size, num_head, query_length, mem_length)
    """
    # TODO(sxjscience) Profile layout
    if normalized:
        query = l2_normalize(query, axis=-1, eps=eps)
        key = l2_normalize(key, axis=-1, eps=eps)
    if scaled:
        if query_head_units is None:
            query_shape = npx.shape_array(query)
            scale = np.sqrt(query_shape[-1])
        else:
            scale = math.sqrt(query_head_units)
    else:
        scale = None
    if layout == 'NKT':
        # 1. Expand the dimension of the mask:
        #   (B, L_query, L_mem) --> (B, 1, L_query, L_mem)
        if mask is not None:
            mask = np.expand_dims(mask, axis=1)
        # 2. Calculate the attention weights
        #   Score: (B, N, L_query, C_Q) X (B, N, L_mem, C_Q) --> (B, N, L_query, L_mem)
        scores = npx.batch_dot(query, key, transpose_b=True)
        if edge_scores is not None:
            scores = scores + edge_scores
        if scaled:
            scores = scores / scale
        attn_weights = masked_softmax(scores, mask, dtype=dtype, axis=-1)
        attn_weights = npx.dropout(attn_weights, p=dropout)
        # 3. Calculate the context vector
        # (B, N, L_query, L_mem) X (B, N, L_mem, C_V) --> (B, L_query, N * C_V)
        if use_einsum:
            context_vec = np.einsum('bnij,bnjc->binc', attn_weights, value)
        else:
            context_vec = npx.batch_dot(attn_weights, value).transpose(
                (0, 2, 1, 3))
        context_vec = npx.reshape(context_vec, (-2, -2, -1))
    elif layout == 'NTK':
        # 1. Expand the dimension of the mask:
        #   (B, L_query, L_mem) --> (B, 1, L_query, L_mem)
        if mask is not None:
            mask = np.expand_dims(mask, axis=1)
        # 2. Calculate the attention weights
        #   Score: (B, L_query, N, C_Q) X (B, L_mem, N, C_Q) --> (B, N, L_query, L_mem)
        if use_einsum:
            scores = np.einsum('binc,bjnc->bnij', query, key)
        else:
            scores = npx.batch_dot(np.swapaxes(query, 1, 2),
                                   np.swapaxes(key, 1, 2),
                                   transpose_b=True)
        if edge_scores is not None:
            scores = scores + edge_scores
        if scaled:
            scores = scores / scale
        attn_weights = masked_softmax(scores, mask, dtype=dtype)
        attn_weights = npx.dropout(attn_weights, p=dropout)
        # 3. Calculate the context vector
        # (B, N, L_query, L_mem) X (B, L_mem, N, C_V) --> (B, L_query, N * C_V)
        if use_einsum:
            context_vec = np.einsum('bnij,bjnc->binc', attn_weights, value)
        else:
            context_vec = npx.batch_dot(attn_weights,
                                        np.swapaxes(value, 1, 2)).transpose(
                                            (0, 2, 1, 3))
        context_vec = npx.reshape(context_vec, (-2, -2, -1))
    elif layout == 'TNK':
        # 1. Expand the dimension of the mask:
        #   (B, L_query, L_mem) --> (B, 1, L_query, L_mem)
        if mask is not None:
            mask = np.expand_dims(mask, axis=1)
        # 2. Calculate the attention weights
        #   Score: (L_query, B, N, C_Q) X (L_mem, B, N, C_Q) --> (B, N, L_query, L_mem)
        #   This layout structure can be implemented very efficiently because B, N are consecutive
        #   to each other. To have a clear picture of what's happening, we may consider the
        #   (i, j)th element of the output
        #       out[i, j, :, :] = query[:, i, j, :] X key[:, i, j, :].T, which is just one GEMM call
        #   We can thus implement the whole kernel via a single call of batched GEMM with stride.
        if use_einsum:
            scores = np.einsum('ibnc,jbnc->bnij', query, key)
        else:
            scores = npx.batch_dot(query.transpose((1, 2, 0, 3)),
                                   key.transpose((1, 2, 3, 0)))
        if edge_scores is not None:
            scores = scores + edge_scores
        if scaled:
            scores = scores / scale
        attn_weights = masked_softmax(scores, mask, dtype=dtype)
        attn_weights = npx.dropout(attn_weights, p=dropout)
        # 3. Calculate the context vector
        # (B, N, L_query, L_mem) X (L_mem, B, N, C_V) --> (L_query, B, N * C_V)
        # Again, we can implement it via a single call to batched GEMM with stride.

        # Shape (B, N, L_query, C_V)
        if use_einsum:
            context_vec = np.einsum('bnij,jbnc->ibnc', attn_weights, value)
        else:
            context_vec = npx.batch_dot(attn_weights,
                                        value.transpose(
                                            (1, 2, 0, 3))).transpose(
                                                (2, 0, 1, 3))
        context_vec = npx.reshape(context_vec, (-2, -2, -1))
    else:
        raise NotImplementedError(
            'layout="{}" is not supported! '
            'We only support layout = "NKT", "NTK", and "TNK".'.format(layout))
    return context_vec, [scores, attn_weights]
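A minimal call sketch for the 'NKT' layout (the shapes are arbitrary, and the helpers the function relies on, such as masked_softmax and l2_normalize, are assumed to be in scope as in the surrounding module):

from mxnet import np, npx
npx.set_np()

batch, heads, q_len, mem_len, head_dim = 2, 4, 5, 7, 16
query = np.random.normal(size=(batch, heads, q_len, head_dim))   # 'NKT': (B, N, T, C)
key = np.random.normal(size=(batch, heads, mem_len, head_dim))
value = np.random.normal(size=(batch, heads, mem_len, head_dim))

context_vec, (scores, attn_weights) = multi_head_dot_attn(
    query, key, value, layout='NKT', query_head_units=head_dim)
print(context_vec.shape)   # (2, 5, 64) = (batch_size, query_length, num_heads * value_dim)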
Example No. 7
def dot_attn_score(query,
                   key,
                   scaled=True,
                   normalized=False,
                   eps=1E-6,
                   layout='NT'):
    """The inner function call to calculate the score used in dot-product attention.

    We support multiple leading batch dimensions.

    scaled is True:
        D(h_q, h_k) = <h_q, h_k> / sqrt(dim_q)

    normalized is True:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||>

    both scaled and normalized:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||> / sqrt(dim_q)

    Parameters
    ----------
    query : symbol or ndarray
        - layout is 'NT'
            (B0, ..., BN, query_length, query_dim)
        - layout is 'TN'
            (query_length, B0, ..., BN, query_dim)
    key : symbol or ndarray
        - layout is 'NT'
            (B0, ..., BN, key_length, key_dim)
        - layout is 'TN'
            (key_length, B0, ..., BN, key_dim)
    scaled : bool
        Whether to divide the query by the square-root of the query_dim
        If True: D(h_q, h_k) = <h_q, h_k> / sqrt(dim_q)
    normalized : bool
        Whether to normalize the query and the key embeddings
        If True: D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||>
    eps : float
        The epsilon used in the normalization
    layout
        The layout of the layer. Can be 'TN' or 'NT'.

    Returns
    -------
    scores : symbol or ndarray
        (B0, ..., BN, query_length, key_length)
    """
    if normalized:
        query = l2_normalize(query, -1, eps=eps)
        key = l2_normalize(key, -1, eps=eps)
    if scaled:
        query_shape = npx.shape_array(query)
        # TODO(sxjscience) Remove .astype(np.float32).
        #  Wait for https://github.com/apache/incubator-mxnet/issues/18084
        query_units = query_shape[-1].astype(np.float32)
        query = query / np.sqrt(query_units)
    if layout == 'NT':
        scores = npx.batch_dot(query, key, transpose_b=True)
    else:
        raise NotImplementedError(
            'layout={} is not supported.'
            ' Currently, only layout = "NT" is implemented!'.format(layout))
    return scores
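A minimal call sketch for the 'NT' layout (arbitrary illustration shapes):

from mxnet import np, npx
npx.set_np()

query = np.random.normal(size=(2, 5, 16))   # (batch_size, query_length, query_dim)
key = np.random.normal(size=(2, 7, 16))     # (batch_size, key_length, key_dim)
scores = dot_attn_score(query, key)
print(scores.shape)                         # (2, 5, 7)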
Example No. 8
    def forward(self, data):
        # RMS-normalize over the last axis, then apply the learned scale and shift.
        var = np.power(data, 2).mean(-1, keepdims=True)
        data = data * np.reciprocal(np.sqrt(var + self._epsilon))
        return data * self.gamma.data() + self.beta.data()
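To make the computation concrete, an equivalent step-by-step sketch with made-up values (the epsilon, gamma, and beta below are illustrative assumptions):

from mxnet import np, npx
npx.set_np()

eps = 1e-6
data = np.array([[1.0, 2.0, 3.0]])
gamma = np.ones(3)                               # learned scale
beta = np.zeros(3)                               # learned shift

var = np.power(data, 2).mean(-1, keepdims=True)  # mean of squares (RMS^2), not the true variance
out = data * np.reciprocal(np.sqrt(var + eps)) * gamma + beta
print(out)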
Example No. 9
    def forward(self, x):
        # Compute the mean of squares in float32 for numerical stability.
        var = np.power(x.astype('float32'), 2).mean(-1, keepdims=True)
        x = x * np.reciprocal(np.sqrt(var + self.variance_epsilon))
        # Cast back to float16 when the scale parameter is stored in half precision.
        if self.gemma.dtype == 'float16':
            x = x.astype('float16')
        return self.gemma * x
Example No. 10
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken,
    # clip predictions that are less than 1 up to 1.
    net_out = net(features)
    clipped_preds = np.clip(net_out, 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())