Example #1
    def forward(self, inputs):
        self.retain_inputs((0, ))
        xp = backend.get_array_module(*inputs)
        x, gy = inputs
        self._gy_shape = gy.shape
        gW = xp.zeros(self.w_shape, dtype=gy.dtype)

        if xp is numpy:
            # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
            # too slow.
            for ix, igy in six.moves.zip(x.ravel(), gy.reshape(x.size, -1)):
                if ix == self.ignore_label:
                    continue
                gW[ix] += igy
        else:
            utils.nondeterministic('atomicAdd')
            if self.ignore_label is None:
                cuda.elementwise(
                    'T gy, S x, S n_out', 'raw T gW',
                    'ptrdiff_t w_ind[] = {x, i % n_out};'
                    'atomicAdd(&gW[w_ind], gy)',
                    'embed_id_bwd')(gy, xp.expand_dims(x, -1), gW.shape[1], gW)
            else:
                cuda.elementwise(
                    'T gy, S x, S n_out, S ignore', 'raw T gW', '''
                    if (x != ignore) {
                      ptrdiff_t w_ind[] = {x, i % n_out};
                      atomicAdd(&gW[w_ind], gy);
                    }
                    ''',
                    'embed_id_bwd_ignore_label')(gy, xp.expand_dims(x, -1),
                                                 gW.shape[1],
                                                 self.ignore_label, gW)
        return gW,
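The CPU branch's comment notes that the loop is equivalent to `numpy.add.at(gW, x, gy)`. A minimal NumPy sketch (not part of the original) checking that equivalence when `ignore_label` is unset:

    import numpy as np

    n_vocab, n_out = 5, 3
    x = np.array([[0, 2], [2, 4]], dtype=np.int32)         # token ids
    gy = np.random.randn(2, 2, n_out).astype(np.float32)   # upstream gradient

    # Loop form, as in the CPU branch above.
    gW_loop = np.zeros((n_vocab, n_out), dtype=gy.dtype)
    for ix, igy in zip(x.ravel(), gy.reshape(x.size, -1)):
        gW_loop[ix] += igy

    # ufunc.at form (equivalent but slower, per the comment above).
    gW_at = np.zeros_like(gW_loop)
    np.add.at(gW_at, x.ravel(), gy.reshape(x.size, -1))

    assert np.allclose(gW_loop, gW_at)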
Example #2
    def label_probability(self, label_size, path, path_length, multiply_seq,
                          xp):
        seq_length = len(multiply_seq)
        n_batch = len(path)
        dtype = multiply_seq.dtype

        ret = xp.zeros((seq_length, n_batch, label_size), dtype)
        if xp == numpy:
            for b in six.moves.range(len(path)):
                target_path = path[b, :path_length[b]]
                chars = {c for c in target_path}
                for c in chars:
                    ret[:, b, c] = xp.sum(
                        multiply_seq[:, b, 0:path_length[b]][:,
                                                             target_path == c],
                        axis=1)
        else:
            utils.nondeterministic('atomicAdd')
            cuda.elementwise(
                'T prob, I path, I path_length, I max_path_length',
                'raw T cum_prob', '''
                I t = i % max_path_length;
                if (t < path_length) {
                  int n_batch = cum_prob.shape()[1];
                  I s = i / (max_path_length * n_batch);
                  I b = (i - s * (max_path_length * n_batch))
                      / max_path_length;
                  int ind[] = {s, b, path};
                  atomicAdd(&cum_prob[ind], prob);
                }
                ''', 'ctc_label_prob_sum')(multiply_seq, path,
                                           path_length[:, None], path.shape[1],
                                           ret)
        return ret
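In the NumPy branch, `ret[:, b, c]` collects the probability mass of every path position labeled `c`. A minimal NumPy sketch (shapes are illustrative assumptions) of the same accumulation written as a scatter-add:

    import numpy as np

    seq_length, n_batch, label_size, max_path = 4, 2, 3, 5
    rng = np.random.default_rng(0)
    multiply_seq = rng.random((seq_length, n_batch, max_path), dtype=np.float32)
    path = rng.integers(0, label_size, size=(n_batch, max_path)).astype(np.int32)
    path_length = np.array([5, 3], dtype=np.int32)

    ret = np.zeros((seq_length, n_batch, label_size), np.float32)
    for b in range(n_batch):
        n = int(path_length[b])
        # Scatter-add along the label axis; duplicate labels on the path
        # accumulate, mirroring atomicAdd in the GPU kernel.
        np.add.at(ret[:, b, :], (slice(None), path[b, :n]),
                  multiply_seq[:, b, :n])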
Example #3
    def label_probability(self, label_size, path, path_length,
                          multiply_seq, xp):
        seq_length = len(multiply_seq)
        n_batch = len(path)
        dtype = multiply_seq.dtype

        ret = xp.zeros((seq_length, n_batch, label_size), dtype)
        if xp == numpy:
            for b in six.moves.range(len(path)):
                target_path = path[b, :path_length[b]]
                chars = {c for c in target_path}
                for c in chars:
                    ret[:, b, c] = xp.sum(
                        multiply_seq[:, b, 0:path_length[b]]
                        [:, target_path == c], axis=1)
        else:
            utils.nondeterministic('atomicAdd')
            cuda.elementwise(
                'T prob, I path, I path_length, I max_path_length',
                'raw T cum_prob',
                '''
                I t = i % max_path_length;
                if (t < path_length) {
                  int n_batch = cum_prob.shape()[1];
                  I s = i / (max_path_length * n_batch);
                  I b = (i - s * (max_path_length * n_batch))
                      / max_path_length;
                  int ind[] = {s, b, path};
                  atomicAdd(&cum_prob[ind], prob);
                }
                ''', 'ctc_label_prob_sum'
            )(multiply_seq, path, path_length[:, None], path.shape[1], ret)
        return ret
Example #4
    def forward_gpu(self, inputs):
        utils.nondeterministic('atomicAdd')
        self.retain_inputs((0, 1, 2))
        x, W, gy = inputs

        if self.reduce == 'no':
            gy = gy[:, None]

        samples = self.samples
        wx = self.wx.astype(x.dtype, copy=False)
        g = cuda.elementwise(
            'T wx, T gy, int32 m', 'T g',
            '''
            T y;
            if (i % m == 0) {
              y = 1;
            } else {
              y = -1;
            }

            g = -y * gy / (1.0f + __expf(wx * y));
            ''',
            'negative_sampling_calculate_g'
        )(wx, gy, self.sample_size + 1)

        cupy = cuda.cupy
        gx = cupy.zeros_like(x)
        n_in = x.shape[1]
        cuda.elementwise(
            'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
            '''
            int d = i / c;
            T w = 0;
            if (mask == 1){
                for (int j = 0; j < m; ++j) {
                  w += g[d * m + j] * W[k[d * m + j] * c + i % c];
                }
            }
            gx = w;
            ''',
            'negative_sampling_calculate_gx'
        )(g, W, self.ignore_mask[:, None], samples, n_in,
          self.sample_size + 1, gx)

        gW = cupy.zeros_like(W)
        cuda.elementwise(
            'T g, raw T x, S k, bool mask, int32 c, int32 m',
            'raw T gW',
            '''
            T gi = g;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
                }
            }
            ''',
            'negative_sampling_calculate_gw'
        )(g, x, samples, self.ignore_mask[:, None], n_in,
          self.sample_size + 1, gW)
        return gx, None, gW
Example #5
    def backward_gpu(self, inputs, gy):
        utils.nondeterministic('atomicAdd')
        bottom_rois, bottom_roi_indices = inputs[1:]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape,
                                      bottom_rois.dtype)

        cuda.elementwise(
            '''
            raw T top_diff, raw int32 argmax_data,
            raw T bottom_rois, raw int32 bottom_roi_indices, int32 num_rois,
            T spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width
            ''', 'raw T bottom_diff', '''
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int n = i / pooled_width / pooled_height / channels;

            int roi_batch_ind = bottom_roi_indices[n];
            int bottom_diff_offset =
                (roi_batch_ind * channels + c) * height * width;
            int top_diff_offset =
                (n * channels + c) * pooled_height * pooled_width;

            int max_index =
                argmax_data[top_diff_offset + ph * pooled_width + pw];
            if (max_index != -1) {
                atomicAdd(
                    &bottom_diff[bottom_diff_offset + max_index],
                    top_diff[top_diff_offset + ph * pooled_width + pw]);
            }
            ''', 'roi_max_pooling_2d_bwd')(gy[0],
                                           self.argmax_data,
                                           bottom_rois,
                                           bottom_roi_indices,
                                           bottom_rois.shape[0],
                                           self.spatial_scale,
                                           channels,
                                           height,
                                           width,
                                           self.outh,
                                           self.outw,
                                           bottom_diff,
                                           size=gy[0].size)

        return bottom_diff, None, None
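The kernel scatters each pooled output's gradient back to the input cell that won the max in the forward pass. A minimal NumPy sketch (not part of the original) of that scatter for a single (roi, channel) plane:

    import numpy as np

    height, width = 4, 4
    top_diff = np.array([0.5, 1.0, -2.0], dtype=np.float32)  # pooled grads
    argmax = np.array([3, 3, -1], dtype=np.int32)  # flat h * width + w; -1 = empty bin

    bottom_diff = np.zeros(height * width, dtype=np.float32)
    valid = argmax != -1
    # Colliding argmax indices accumulate, which is why the kernel needs
    # atomicAdd and is flagged nondeterministic.
    np.add.at(bottom_diff, argmax[valid], top_diff[valid])
    bottom_diff = bottom_diff.reshape(height, width)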
Example #6
def _cupy_coo_matmul():
    utils.nondeterministic('atomicAdd')
    return cuda.elementwise(
        'int32 nb, int32 _m, int32 _n, int32 _k, int32 nnz, int32 chunk, \
         raw A A_data, raw T A_row, raw T A_col, \
         raw B _B',
        'raw C _C',
        '''
        int i_n = (i % _n);
        int i0 = (i / _n) * chunk;
        int i_C = -1;
        C val_C = 0;
        for (int i1 = 0; i1 < chunk; i1++) {
            int i_A = i0 + i1;
            int i_b = i_A / nnz;
            if (i_b >= nb) {
                continue;
            }
            int i_k = A_col[i_A];
            if (i_k < 0) {
                continue;
            }
            assert(i_k < _k);
            int i_m = A_row[i_A];
            if (i_m < 0) {
                continue;
            }
            assert(i_m < _m);
            int i_B = i_n + _n * (i_k + _k * i_b);
            int i_C_now = i_n + _n * (i_m + _m * i_b);
            A val_A = A_data[i_A];
            B val_B = _B[i_B];
            C val_C_now = static_cast<C>(val_A * val_B);
            if (i_C >= 0 && i_C != i_C_now) {
                atomicAdd(&_C[i_C], val_C);
                val_C = 0;
            }
            i_C = i_C_now;
            val_C += val_C_now;
        }
        if (i_C >= 0) {
            atomicAdd(&_C[i_C], val_C);
        }
        ''',
        'coo_matmul')
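The kernel computes a batched sparse-dense product `C[b] = A[b] @ B[b]`, with `A` stored as padded COO triplets (negative row or column entries mark padding and are skipped). A dense NumPy reference (not part of the original; the array layout is inferred from the kernel's indexing) for checking results:

    import numpy as np

    def coo_matmul_reference(A_data, A_row, A_col, B, m):
        """C[b, r, :] += A_data[b, z] * B[b, c, :] for each stored entry."""
        nb, nnz = A_data.shape
        C = np.zeros((nb, m, B.shape[2]), dtype=B.dtype)
        for b in range(nb):
            for z in range(nnz):
                r, c = int(A_row[b, z]), int(A_col[b, z])
                if r < 0 or c < 0:  # padded entry, skipped by the kernel too
                    continue
                C[b, r, :] += A_data[b, z] * B[b, c, :]
        return C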
Example #7
def _cupy_coo_matmul():
    utils.nondeterministic('atomicAdd')
    return cuda.elementwise(
        'int32 nb, int32 _m, int32 _n, int32 _k, int32 nnz, int32 chunk, \
         raw A A_data, raw T A_row, raw T A_col, \
         raw B _B', 'raw C _C', '''
        int i_n = (i % _n);
        int i0 = (i / _n) * chunk;
        int i_C = -1;
        C val_C = 0;
        for (int i1 = 0; i1 < chunk; i1++) {
            int i_A = i0 + i1;
            int i_b = i_A / nnz;
            if (i_b >= nb) {
                continue;
            }
            int i_k = A_col[i_A];
            if (i_k < 0) {
                continue;
            }
            assert(i_k < _k);
            int i_m = A_row[i_A];
            if (i_m < 0) {
                continue;
            }
            assert(i_m < _m);
            int i_B = i_n + _n * (i_k + _k * i_b);
            int i_C_now = i_n + _n * (i_m + _m * i_b);
            A val_A = A_data[i_A];
            B val_B = _B[i_B];
            C val_C_now = static_cast<C>(val_A * val_B);
            if (i_C >= 0 && i_C != i_C_now) {
                atomicAdd(&_C[i_C], val_C);
                val_C = 0;
            }
            i_C = i_C_now;
            val_C += val_C_now;
        }
        if (i_C >= 0) {
            atomicAdd(&_C[i_C], val_C);
        }
        ''', 'coo_matmul')
Example #8
    def backward_gpu(self, inputs, gy):
        utils.nondeterministic('atomicAdd')
        bottom_rois, bottom_roi_indices = inputs[1:]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(
            self._bottom_data_shape, bottom_rois.dtype)

        cuda.elementwise(
            '''
            raw T top_diff, raw int32 argmax_data,
            raw T bottom_rois, raw int32 bottom_roi_indices, int32 num_rois,
            T spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width
            ''',
            'raw T bottom_diff',
            '''
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int n = i / pooled_width / pooled_height / channels;

            int roi_batch_ind = bottom_roi_indices[n];
            int bottom_diff_offset =
                (roi_batch_ind * channels + c) * height * width;
            int top_diff_offset =
                (n * channels + c) * pooled_height * pooled_width;

            int max_index =
                argmax_data[top_diff_offset + ph * pooled_width + pw];
            if (max_index != -1) {
                atomicAdd(
                    &bottom_diff[bottom_diff_offset + max_index],
                    top_diff[top_diff_offset + ph * pooled_width + pw]);
            }
            ''', 'roi_max_pooling_2d_bwd'
        )(gy[0], self.argmax_data, bottom_rois, bottom_roi_indices,
          bottom_rois.shape[0], self.spatial_scale, channels, height, width,
          self.outh, self.outw, bottom_diff, size=gy[0].size)

        return bottom_diff, None, None
Example #9
    def backward_gpu(self, inputs, grad_outputs):
        utils.nondeterministic('atomicAdd')
        x, t, W = inputs
        gloss, = grad_outputs

        n_in = x.shape[1]
        gx = cuda.cupy.zeros_like(x)
        gW = cuda.cupy.zeros_like(W)
        cuda.elementwise(
            '''T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, raw T gloss,
            int32 c, int32 max_length''',
            'raw T gx, raw T gw',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];
              T code = codes[p];

              T g = -gloss[0] * code / (1.0 + exp(wxy));
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                atomicAdd(&gx[x_ind], g * w[w_ind]);
                atomicAdd(&gw[w_ind], g * x[x_ind]);
              }
            }
            ''',
            'binary_hierarchical_softmax_bwd'
        )(self.wxy, x, W, t, self.paths, self.codes, self.begins, gloss, n_in,
          self.max_length, gx, gW)
        return gx, None, gW
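The scalar factor `-gloss[0] * code / (1.0 + exp(wxy))` matches the derivative of the per-node loss `log(1 + exp(-wxy))`, scaled by the upstream gradient and the sign code. A quick numeric check (not part of the original) of that derivative:

    import numpy as np

    z = np.linspace(-3.0, 3.0, 7)
    eps = 1e-6
    # Central difference of loss(z) = log(1 + exp(-z)).
    numeric = (np.log1p(np.exp(-(z + eps)))
               - np.log1p(np.exp(-(z - eps)))) / (2 * eps)
    analytic = -1.0 / (1.0 + np.exp(z))   # factor used in the kernel
    assert np.allclose(numeric, analytic, atol=1e-5)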
Example #10
    def forward(self, inputs):
        self.retain_inputs((0,))
        xp = backend.get_array_module(*inputs)
        x, gy = inputs
        self._gy_shape = gy.shape
        gW = xp.zeros(self.w_shape, dtype=gy.dtype)

        if xp is numpy:
            # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
            # too slow.
            for ix, igy in six.moves.zip(x.ravel(),
                                         gy.reshape(x.size, -1)):
                if ix == self.ignore_label:
                    continue
                gW[ix] += igy
        else:
            utils.nondeterministic('atomicAdd')
            if self.ignore_label is None:
                cuda.elementwise(
                    'T gy, S x, S n_out', 'raw T gW',
                    'ptrdiff_t w_ind[] = {x, i % n_out};'
                    'atomicAdd(&gW[w_ind], gy)',
                    'embed_id_bwd')(
                        gy, xp.expand_dims(x, -1), gW.shape[1], gW)
            else:
                cuda.elementwise(
                    'T gy, S x, S n_out, S ignore', 'raw T gW',
                    '''
                    if (x != ignore) {
                      ptrdiff_t w_ind[] = {x, i % n_out};
                      atomicAdd(&gW[w_ind], gy);
                    }
                    ''',
                    'embed_id_bwd_ignore_label')(
                        gy, xp.expand_dims(x, -1), gW.shape[1],
                        self.ignore_label, gW)
        return gW,
Example #11
    def backward_gpu(self, inputs, gy):
        utils.nondeterministic('atomicAdd')
        bottom_rois, bottom_roi_indices = inputs[1:]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, gy[0].dtype)

        if self.sampling_ratio[0] is None:
            sampling_ratio_h = 0
        else:
            sampling_ratio_h = self.sampling_ratio[0]
        if self.sampling_ratio[1] is None:
            sampling_ratio_w = 0
        else:
            sampling_ratio_w = self.sampling_ratio[1]

        cuda.elementwise(
            '''
            raw T top_diff, T spatial_scale,
            int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width,
            int32 sampling_ratio_h, int32 sampling_ratio_w,
            raw T bottom_rois, raw int32 bottom_roi_indices
            ''',
            'raw T bottom_diff, raw int32 argmax_data',
            '''
            // (n, c, h, w) coords in bottom data
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int n = i / pooled_width / pooled_height / channels;

            // Do not use rounding; this implementation detail is critical
            int roi_batch_ind = bottom_roi_indices[n];
            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;

            // Force malformed ROIs to be 1x1
            T roi_width = max(roi_end_w - roi_start_w, (T)1.);
            T roi_height = max(roi_end_h - roi_start_h, (T)1.);
            T bin_size_h = static_cast<T>(roi_height) /
                static_cast<T>(pooled_height);
            T bin_size_w = static_cast<T>(roi_width) /
                static_cast<T>(pooled_width);

            int bottom_diff_offset =
                (roi_batch_ind * channels + c) * height * width;

            int top_offset = (n * channels + c) * pooled_height * pooled_width;
            int max_index = argmax_data[top_offset + ph * pooled_width + pw];

            if (max_index != -1) {
                T top_diff_this_bin =
                    top_diff[top_offset + ph * pooled_width + pw];

                // We use roi_bin_grid to sample the grid and mimic integral
                int roi_bin_grid_h = (sampling_ratio_h > 0)
                    ? sampling_ratio_h
                    : ceil(roi_height / pooled_height); // e.g. = 2
                int roi_bin_grid_w = (sampling_ratio_w > 0)
                    ? sampling_ratio_w
                    : ceil(roi_width / pooled_width);

                int iy = max_index / roi_bin_grid_w;
                int ix = max_index % roi_bin_grid_w;

                T y = roi_start_h + ph * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h /
                        static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
                T x = roi_start_w + pw * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

                // bilinear_interpolation_gradient {{
                int y_low, x_low, y_high, x_high;
                T w1, w2, w3, w4;
                bool y_ret = get_bounds(y, height, y_low, y_high);
                bool x_ret = get_bounds(x, width, x_low, x_high);
                if (!x_ret || !y_ret) continue;
                get_bilinear_interp_params(
                    y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);

                if (w1 > 0 && y_low >= 0 && x_low >= 0) {
                    T g1 = top_diff_this_bin * w1;
                    atomicAdd(&bottom_diff[
                        bottom_diff_offset + y_low * width + x_low], g1);
                }
                if (w2 > 0 && y_low >= 0 && x_high <= width - 1) {
                    T g2 = top_diff_this_bin * w2;
                    atomicAdd(&bottom_diff[
                        bottom_diff_offset + y_low * width + x_high], g2);
                }
                if (w3 > 0 && y_high <= height - 1 && x_low >= 0) {
                    T g3 = top_diff_this_bin * w3;
                    atomicAdd(&bottom_diff[
                        bottom_diff_offset + y_high * width + x_low], g3);
                }
                if (w4 > 0 && y_high <= height - 1 && x_high <= width - 1) {
                    T g4 = top_diff_this_bin * w4;
                    atomicAdd(&bottom_diff[
                        bottom_diff_offset + y_high * width + x_high], g4);
                }
            }
            // }}
            ''',
            'roi_max_align_2d_bwd',
            preamble=_GET_BILINEAR_INTERP_KERNEL,
        )(gy[0],
          self.spatial_scale,
          channels,
          height,
          width,
          self.outh,
          self.outw,
          sampling_ratio_h,
          sampling_ratio_w,
          bottom_rois,
          bottom_roi_indices,
          bottom_diff,
          self.argmax_data,
          size=gy[0].size)

        return bottom_diff, None, None
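`get_bounds` and `get_bilinear_interp_params` come from `_GET_BILINEAR_INTERP_KERNEL`, which is not shown here. Under standard bilinear interpolation, which this kernel presumably implements, the four weights are products of the fractional offsets to the neighboring grid points; a hedged Python sketch of that assumption:

    import math

    def bilinear_interp_params(y, x):
        # Standard bilinear weights for the four neighbors of (y, x):
        # (low, low), (low, high), (high, low), (high, high). This is an
        # assumption about the preamble helper, not its actual source.
        y_low, x_low = math.floor(y), math.floor(x)
        ly, lx = y - y_low, x - x_low
        hy, hx = 1.0 - ly, 1.0 - lx
        return hy * hx, hy * lx, ly * hx, ly * lx

    w1, w2, w3, w4 = bilinear_interp_params(0.5, 1.5)
    assert abs((w1 + w2 + w3 + w4) - 1.0) < 1e-12  # weights always sum to 1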
Example #12
    def backward_gpu(self, inputs, gy):
        utils.nondeterministic('atomicAdd')
        bottom_rois, bottom_roi_indices = inputs[1:]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, gy[0].dtype)

        cuda.elementwise(
            '''
            raw T top_diff, raw T bottom_rois, raw int32 bottom_roi_indices,
            T spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width
            ''', 'raw T bottom_diff', '''
            // pos in output filter
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int n = i / pooled_width / pooled_height / channels;

            int roi_batch_ind = bottom_roi_indices[n];
            int roi_start_h = round(bottom_rois[n * 4 + 0] * spatial_scale);
            int roi_start_w = round(bottom_rois[n * 4 + 1] * spatial_scale);
            int roi_end_h = round(bottom_rois[n * 4 + 2] * spatial_scale);
            int roi_end_w = round(bottom_rois[n * 4 + 3] * spatial_scale);

            // Force malformed ROIs to be 1x1
            int roi_height = max(roi_end_h - roi_start_h, 1);
            int roi_width = max(roi_end_w - roi_start_w, 1);
            T bin_size_h = static_cast<T>(roi_height)
                           / static_cast<T>(pooled_height);
            T bin_size_w = static_cast<T>(roi_width)
                           / static_cast<T>(pooled_width);

            int hstart = static_cast<int>(floor(static_cast<T>(ph)
                                          * bin_size_h));
            int wstart = static_cast<int>(floor(static_cast<T>(pw)
                                          * bin_size_w));
            int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
                                        * bin_size_h));
            int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
                                        * bin_size_w));

            // Add roi offsets and clip to input boundaries
            hstart = min(max(hstart + roi_start_h, 0), height);
            hend = min(max(hend + roi_start_h, 0), height);
            wstart = min(max(wstart + roi_start_w, 0), width);
            wend = min(max(wend + roi_start_w, 0), width);
            bool is_empty = (hend <= hstart) || (wend <= wstart);

            int bottom_diff_offset =
                (roi_batch_ind * channels + c) * height * width;
            int top_offset =
                (n * channels + c) * pooled_height * pooled_width;

            T count = (hend - hstart) * (wend - wstart);
            T diff_val = is_empty ? 0. :
                top_diff[top_offset + ph * pooled_width + pw] / count;
            for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                    int bottom_index = h * width + w;
                    atomicAdd(
                        &bottom_diff[bottom_diff_offset + bottom_index],
                        diff_val);
                }
            }
            ''', 'roi_average_pooling_2d_bwd')(gy[0],
                                               bottom_rois,
                                               bottom_roi_indices,
                                               self.spatial_scale,
                                               channels,
                                               height,
                                               width,
                                               self.outh,
                                               self.outw,
                                               bottom_diff,
                                               size=gy[0].size)

        return bottom_diff, None, None
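Each pooled bin's gradient is divided by the bin's cell count and added uniformly to every input cell the bin covers. A minimal NumPy sketch (not part of the original) of that inner double loop for one bin:

    import numpy as np

    height, width = 6, 6
    bottom_diff = np.zeros((height, width), dtype=np.float32)
    hstart, hend, wstart, wend = 1, 3, 2, 5   # clipped bin bounds
    top_diff_val = 1.2                        # gradient of this pooled output

    count = (hend - hstart) * (wend - wstart)
    bottom_diff[hstart:hend, wstart:wend] += top_diff_val / count
    # Overlapping ROIs accumulate into the same cells, which is why the
    # kernel uses atomicAdd and is flagged nondeterministic.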
Example #13
    def backward_gpu(self, inputs, gy):
        utils.nondeterministic('atomicAdd')
        bottom_rois, bottom_roi_indices = inputs[1:]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, gy[0].dtype)

        if self.sampling_ratio[0] is None:
            sampling_ratio_h = 0
        else:
            sampling_ratio_h = self.sampling_ratio[0]
        if self.sampling_ratio[1] is None:
            sampling_ratio_w = 0
        else:
            sampling_ratio_w = self.sampling_ratio[1]

        cuda.elementwise(
            '''
            raw T top_diff, T spatial_scale,
            int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width,
            int32 sampling_ratio_h, int32 sampling_ratio_w,
            raw T bottom_rois, raw int32 bottom_roi_indices
            ''',
            'raw T bottom_diff, raw int32 argmax_data',
            '''
            // (n, c, h, w) coords in bottom data
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int n = i / pooled_width / pooled_height / channels;

            // Do not use rounding; this implementation detail is critical
            int roi_batch_ind = bottom_roi_indices[n];
            T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
            T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
            T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
            T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;

            // Force malformed ROIs to be 1x1
            T roi_width = max(roi_end_w - roi_start_w, (T)1.);
            T roi_height = max(roi_end_h - roi_start_h, (T)1.);
            T bin_size_h = static_cast<T>(roi_height) /
                static_cast<T>(pooled_height);
            T bin_size_w = static_cast<T>(roi_width) /
                static_cast<T>(pooled_width);

            int bottom_diff_offset =
                (roi_batch_ind * channels + c) * height * width;

            int top_offset = (n * channels + c) * pooled_height * pooled_width;
            int max_index = argmax_data[top_offset + ph * pooled_width + pw];

            if (max_index != -1) {
                T top_diff_this_bin =
                    top_diff[top_offset + ph * pooled_width + pw];

                // We use roi_bin_grid to sample the grid and mimic integral
                int roi_bin_grid_h = (sampling_ratio_h > 0)
                    ? sampling_ratio_h
                    : ceil(roi_height / pooled_height); // e.g. = 2
                int roi_bin_grid_w = (sampling_ratio_w > 0)
                    ? sampling_ratio_w
                    : ceil(roi_width / pooled_width);

                int iy = max_index / roi_bin_grid_w;
                int ix = max_index % roi_bin_grid_w;

                T y = roi_start_h + ph * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h /
                        static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
                T x = roi_start_w + pw * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

                // bilinear_interpolation_gradient {{
                int y_low, x_low, y_high, x_high;
                T w1, w2, w3, w4;
                bool y_ret = get_bounds(y, height, y_low, y_high);
                bool x_ret = get_bounds(x, width, x_low, x_high);
                if (!x_ret || !y_ret) continue;
                get_bilinear_interp_params(
                    y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);

                T g1 = top_diff_this_bin * w1;
                T g2 = top_diff_this_bin * w2;
                T g3 = top_diff_this_bin * w3;
                T g4 = top_diff_this_bin * w4;

                if (x_low >= 0 && x_high >= 0 &&
                        y_low >= 0 && y_high >= 0) {
                    atomicAdd(&bottom_diff[bottom_diff_offset +
                                           y_low * width + x_low], g1);
                    atomicAdd(&bottom_diff[bottom_diff_offset +
                                           y_low * width + x_high], g2);
                    atomicAdd(&bottom_diff[bottom_diff_offset +
                                           y_high * width + x_low], g3);
                    atomicAdd(&bottom_diff[bottom_diff_offset +
                                           y_high * width + x_high], g4);
                }
            }
            // }}
            ''',
            'roi_max_align_2d_bwd',
            preamble=_GET_BILINEAR_INTERP_KERNEL,
        )(gy[0], self.spatial_scale, channels, height, width,
          self.outh, self.outw, sampling_ratio_h, sampling_ratio_w,
          bottom_rois, bottom_roi_indices, bottom_diff, self.argmax_data,
          size=gy[0].size)

        return bottom_diff, None, None
Example #14
    def backward_gpu(self, inputs, gy):
        utils.nondeterministic('atomicAdd')
        bottom_rois, bottom_roi_indices = inputs[1:]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(
            self._bottom_data_shape, gy[0].dtype)

        cuda.elementwise(
            '''
            raw T top_diff, raw T bottom_rois, raw int32 bottom_roi_indices,
            T spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width
            ''',
            'raw T bottom_diff',
            '''
            // pos in output filter
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int n = i / pooled_width / pooled_height / channels;

            int roi_batch_ind = bottom_roi_indices[n];
            int roi_start_h = round(bottom_rois[n * 4 + 0] * spatial_scale);
            int roi_start_w = round(bottom_rois[n * 4 + 1] * spatial_scale);
            int roi_end_h = round(bottom_rois[n * 4 + 2] * spatial_scale);
            int roi_end_w = round(bottom_rois[n * 4 + 3] * spatial_scale);

            // Force malformed ROIs to be 1x1
            int roi_height = max(roi_end_h - roi_start_h, 1);
            int roi_width = max(roi_end_w - roi_start_w, 1);
            T bin_size_h = static_cast<T>(roi_height)
                           / static_cast<T>(pooled_height);
            T bin_size_w = static_cast<T>(roi_width)
                           / static_cast<T>(pooled_width);

            int hstart = static_cast<int>(floor(static_cast<T>(ph)
                                          * bin_size_h));
            int wstart = static_cast<int>(floor(static_cast<T>(pw)
                                          * bin_size_w));
            int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
                                        * bin_size_h));
            int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
                                        * bin_size_w));

            // Add roi offsets and clip to input boundaries
            hstart = min(max(hstart + roi_start_h, 0), height);
            hend = min(max(hend + roi_start_h, 0), height);
            wstart = min(max(wstart + roi_start_w, 0), width);
            wend = min(max(wend + roi_start_w, 0), width);
            bool is_empty = (hend <= hstart) || (wend <= wstart);

            int bottom_diff_offset =
                (roi_batch_ind * channels + c) * height * width;
            int top_offset =
                (n * channels + c) * pooled_height * pooled_width;

            T count = (hend - hstart) * (wend - wstart);
            T diff_val = is_empty ? 0. :
                top_diff[top_offset + ph * pooled_width + pw] / count;
            for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                    int bottom_index = h * width + w;
                    atomicAdd(
                        &bottom_diff[bottom_diff_offset + bottom_index],
                        diff_val);
                }
            }
            ''', 'roi_average_pooling_2d_bwd'
        )(gy[0], bottom_rois, bottom_roi_indices, self.spatial_scale,
          channels, height, width, self.outh, self.outw,
          bottom_diff, size=gy[0].size)

        return bottom_diff, None, None