def numerical_grad_gpu(f, inputs, grad_outputs, eps=1e-3):
    grads = tuple(cuda.zeros_like(x) for x in inputs)
    for x, gx in zip(inputs, grads):
        x = x.ravel()
        gx = gx.ravel()
        x_cpu = x.get()
        gx_cpu = gx.get()
        for i in six.moves.range(x_cpu.size):
            orig = x_cpu[i]
            x_cpu[i] = orig + eps
            x.set(x_cpu)
            ys1 = f()
            x_cpu[i] = orig - eps
            x.set(x_cpu)
            ys2 = f()
            x_cpu[i] = orig
            x.set(x_cpu)

            for y1, y2, gy in zip(ys1, ys2, grad_outputs):
                if gy is not None:
                    dot = sum(((y1 - y2) * gy).ravel()).get()
                    gx_cpu[i] += dot / (2 * eps)
        gx.set(gx_cpu)
    return grads
def backward_gpu(self, inputs, loss):
    x, t = inputs
    gloss, = loss
    n_in = x.shape[1]
    gx = cuda.zeros_like(x)
    cuda.elementwise(
        '''T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,
        raw T codes, raw int32 begins, raw T gloss, int32 c,
        int32 max_length''',
        'raw T gx, raw T gw',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];
          T code = codes[p];

          T g = -gloss[0] * code / (1.0 + exp(wxy));
          for (int j = 0; j < c; ++j) {
            atomicAdd(&gx[ind * c + j], g * w[node * c + j]);
            atomicAdd(&gw[node * c + j], g * x[ind * c + j]);
          }
        }
        ''',
        'binary_hierarchical_softmax_bwd'
    )(self.wxy, x, self.W, t, self.paths, self.codes, self.begins,
      gloss, n_in, self.max_length, gx, self.gW)
    return gx, None
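# A minimal NumPy sketch (not part of the source) of what the
# binary_hierarchical_softmax_bwd kernel above computes for a single example:
# `x` is one input vector of length c, `path`/`codes` describe the Huffman
# path of the target word, `wxy[p]` is the precomputed signed inner product
# for node p on that path, and `gloss` is a scalar.
import numpy

def hierarchical_softmax_backward_cpu(x, W, wxy, path, codes, gloss):
    gx = numpy.zeros_like(x)
    gW = numpy.zeros_like(W)
    for p, (node, code) in enumerate(zip(path, codes)):
        g = -gloss * code / (1.0 + numpy.exp(wxy[p]))
        gx += g * W[node]    # accumulate into the input gradient
        gW[node] += g * x    # scatter into the weight gradient (atomicAdd above)
    return gx, gW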
def numerical_grad_gpu(f, inputs, grad_outputs, eps=1e-3):
    grads = tuple(cuda.zeros_like(x) for x in inputs)
    for x, gx in zip(inputs, grads):
        x = x.ravel()
        gx = gx.ravel()
        x_cpu = x.get()
        gx_cpu = gx.get()
        for i in six.moves.range(x_cpu.size):
            orig = x_cpu[i]
            x_cpu[i] = orig + eps
            x.set(x_cpu)
            ys1 = _copy_arrays(f())
            x_cpu[i] = orig - eps
            x.set(x_cpu)
            ys2 = _copy_arrays(f())
            x_cpu[i] = orig
            x.set(x_cpu)

            for y1, y2, gy in zip(ys1, ys2, grad_outputs):
                if gy is not None:
                    dot = sum(((y1 - y2) * gy).ravel()).get()
                    gx_cpu[i] += dot / (2 * eps)
        gx.set(gx_cpu)
    return grads
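# A hedged usage sketch (my own, not from the source): driving
# numerical_grad_gpu with a closure that re-runs the forward pass, so each
# perturbation written back into `x_gpu` is observed. `func` stands for any
# old-style Function whose forward()/backward() take and return tuples of
# arrays; the tolerances are illustrative.
import numpy

def check_backward_example(func, x_gpu, gy_gpu):
    gx_numerical, = numerical_grad_gpu(
        lambda: func.forward((x_gpu,)), (x_gpu,), (gy_gpu,))
    gx_analytic, = func.backward((x_gpu,), (gy_gpu,))
    numpy.testing.assert_allclose(
        gx_numerical.get(), gx_analytic.get(), atol=1e-3, rtol=1e-3)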
def backward(self, x, gy):
    if isinstance(x[0], cuda.GPUArray):
        gx = cuda.zeros_like(x[0])
    else:
        gx = numpy.zeros_like(x[0])
    gys = split_axis.SplitAxis(self.split_inds, axis=1).forward(gy)
    for pooler, gy in zip(self.poolers, gys):
        gy = gy.reshape(pooler.out_shape)
        gx += pooler.backward(x, (gy,))[0]
    return gx,
def create_linear_chain(self, length, gpu):
    if gpu:
        x = chainer.Variable(cuda.to_gpu(self.x))
    else:
        x = chainer.Variable(self.x)
    ret = [x]
    for i in six.moves.range(length):
        ret.append(constant((ret[i],), (self.a,)))
    if gpu:
        ret[-1].grad = cuda.zeros_like(ret[-1].data)
    else:
        ret[-1].grad = np.zeros_like(ret[-1].data)
    return ret
def backward_gpu(self, x, gys):
    gx = cuda.zeros_like(x[0])
    coffset = 0
    kernel = cuda.elementwise(
        _args, 'COPY(x[idx] = y[i])', 'split_bwd', preamble=_preamble)
    for gy in gys:
        if gy is None:
            continue
        cdimy = gy.shape[self.axis]
        if cdimy != 0:
            kernel(gy, gx, cdimy, self.cdimx, self.rdim, coffset)
        coffset += cdimy
    return gx,
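# NumPy counterpart (an illustrative sketch, not from the source) of the
# split_bwd kernel above: each non-None output gradient is copied back into
# its slice of gx along the split axis. The control flow deliberately mirrors
# the GPU version, including how None gradients are skipped.
import numpy

def split_backward_cpu(x, gys, axis):
    gx = numpy.zeros_like(x)
    offset = 0
    for gy in gys:
        if gy is None:
            continue
        size = gy.shape[axis]
        index = [slice(None)] * gx.ndim
        index[axis] = slice(offset, offset + size)
        gx[tuple(index)] = gy
        offset += size
    return gx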
def backward_gpu(self, inputs, grads):
    x, t = inputs
    gloss, = grads
    n_in = x.shape[1]
    g = cuda.empty_like(self.wx)
    cuda.elementwise(
        'float* g, const float* wx, const float* gloss, int m',
        '''
        float y;
        if (i % m == 0) {
          y = 1;
        } else {
          y = -1;
        }

        g[i] = -y * *gloss / (1.0f + __expf(wx[i] * y));
        ''',
        'negative_sampling_calculate_g'
    )(g, self.wx, gloss, self.sample_size + 1)
    gx = cuda.zeros_like(x)
    cuda.elementwise(
        '''float* gx, const float* g, const float* W, const int* k, int c,
        int m''',
        '''
        int d = i / c;
        g = &g[d * m];
        k = &k[d * m];
        float w = 0;
        for (int j = 0; j < m; ++j) {
          w += g[j] * W[k[j] * c + i % c];
        }
        gx[i] = w;
        ''',
        'negative_sampling_calculate_gx'
    )(gx, g, self.W, self.samples, n_in, self.sample_size + 1)
    cuda.elementwise(
        '''const float* g, const float* x, const int* k, float* gW, int c,
        int m''',
        '''
        x = &x[(i / m) * c];
        gW = &gW[k[i] * c];

        float gi = g[i];
        for (int j = 0; j < c; ++j) {
          atomicAdd(gW + j, gi * x[j]);
        }
        ''',
        'negative_sampling_calculate_gw'
    )(g, x, self.samples, self.gW, n_in, self.sample_size + 1)
    return gx, None
def backward_gpu(self, inputs, grads):
    x, t = inputs
    gloss, = grads
    n_in = x.shape[1]
    g = cuda.elementwise(
        'T wx, raw T gloss, int32 m', 'T g',
        '''
        T y;
        if (i % m == 0) {
          y = 1;
        } else {
          y = -1;
        }

        g = -y * gloss[0] / (1.0f + __expf(wx * y));
        ''',
        'negative_sampling_calculate_g'
    )(self.wx, gloss, self.sample_size + 1)
    gx = cuda.zeros_like(x)
    cuda.elementwise(
        'raw T g, raw T W, raw S k, int32 c, int32 m', 'T gx',
        '''
        int d = i / c;
        T w = 0;
        for (int j = 0; j < m; ++j) {
          w += g[d * m + j] * W[k[d * m + j] * c + i % c];
        }
        gx = w;
        ''',
        'negative_sampling_calculate_gx'
    )(g, self.W, self.samples, n_in, self.sample_size + 1, gx)
    cuda.elementwise(
        'T g, raw T x, S k, int32 c, int32 m', 'raw T gW',
        '''
        T gi = g;
        for (int j = 0; j < c; ++j) {
          atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
        }
        ''',
        'negative_sampling_calculate_gw'
    )(g, x, self.samples, n_in, self.sample_size + 1, self.gW)
    return gx, None
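# NumPy sketch (my own reference, not from the source) of the math the three
# negative_sampling_* kernels implement: column 0 of `samples`/`wx` holds the
# positive sample, the remaining m - 1 columns the negatives; `gloss` is a
# scalar.
import numpy

def negative_sampling_backward_cpu(x, W, samples, wx, gloss):
    y = -numpy.ones_like(wx)
    y[:, 0] = 1                                      # positive sample
    g = -y * gloss / (1.0 + numpy.exp(wx * y))       # per-sample coefficient
    gx = numpy.einsum('bm,bmc->bc', g, W[samples])   # input gradient
    gW = numpy.zeros_like(W)
    # unordered scatter-add, like the atomicAdd in the gW kernel
    numpy.add.at(gW, samples.ravel(),
                 (g[:, :, None] * x[:, None, :]).reshape(-1, x.shape[1]))
    return gx, gW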
def forward_gpu(self, inputs):
    x, t = inputs
    n_in = x.shape[1]
    self._make_samples(t)

    wx = cuda.empty((x.shape[0], self.sample_size + 1))
    cuda.elementwise(
        '''float* wx, const float* W, const float* x, const int* k, int c,
        int m''',
        '''
        x = &x[(i / m) * c];
        W = &W[k[i] * c];
        float f = 0;
        for (int j = 0; j < c; ++j) {
          f += x[j] * W[j];
        }
        wx[i] = f;
        ''',
        'negative_sampling_wx'
    )(wx, self.W, x, self.samples, n_in, self.sample_size + 1)
    self.wx = wx

    y = cuda.zeros_like(wx)
    cuda.elementwise(
        'float* y, const float* wx, int c, int m',
        '''
        float f = wx[i];
        if (i % m == 0) {
          f = -f;
        }
        float loss;
        if (f < 0) {
          loss = __logf(1 + __expf(f));
        } else {
          loss = f + __logf(1 + __expf(-f));
        }
        y[i] = loss;
        ''',
        'negative_sampling_forward'
    )(y, wx, n_in, self.sample_size + 1)
    loss = cuda.gpuarray.sum(y)
    return loss,
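# NumPy sketch (not from the source) of the loss the forward kernels above
# compute: softplus(-wx) for the positive sample, softplus(wx) for each
# negative, summed over the batch. numpy.logaddexp(0, f) is the numerically
# stable log(1 + exp(f)) that the branchy CUDA code implements by hand.
import numpy

def negative_sampling_loss_cpu(x, W, samples):
    wx = numpy.einsum('bc,bmc->bm', x, W[samples])  # inner products
    f = wx.copy()
    f[:, 0] *= -1                                   # flip sign for the positive
    return numpy.logaddexp(0.0, f).sum()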
def backward_gpu(self, inputs, loss):
    x, t = inputs
    gloss, = loss
    n_in = x.shape[1]
    gx = cuda.zeros_like(x)
    cuda.elementwise(
        '''const float* wxy, float* gx, float* gw, const float* x,
        const float* w, const int* ts, const int* paths,
        const float* codes, const int* begins, const float* gloss,
        int c, int max_length''',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];
          float code = codes[p];
          gx = &gx[ind * c];
          x = &x[ind * c];

          float g = -*gloss * code / (1.0 + exp(wxy[i]));
          for (int j = 0; j < c; ++j) {
            atomicAdd(gx + j, g * w[node * c + j]);
            atomicAdd(gw + node * c + j, g * x[j]);
          }
        }
        ''',
        'binary_hierarchical_softmax_bwd'
    )(self.wxy, gx, self.gW, x, self.W, t, self.paths, self.codes,
      self.begins, gloss, n_in, self.max_length)
    return gx, None
def init_state_gpu(self, param, grad):
    return cuda.zeros_like(param), cuda.zeros_like(param)
def init_state_gpu(self, param, grad):
    return cuda.zeros_like(param)
def _zeros_like(x):
    if isinstance(x, numpy.ndarray):
        return numpy.zeros_like(x)
    else:
        return cuda.zeros_like(x)
def init_state_gpu(self, param, grad):
    n = cuda.zeros_like(param)
    g = cuda.zeros_like(param)
    delta = cuda.zeros_like(param)
    return n, g, delta