def test_theano_grad(self):
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        batch_size, dim = self.rng.random_integers(2000, size=2)
        y_hat = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
        y = self.rng.randn(batch_size, dim).astype(dtype=np.float32)

        # Theano model
        th_y_hat, th_y = T.fmatrix(), T.fmatrix()
        loss = T.mean(T.sum((th_y_hat - th_y) ** 2, axis=1))
        get_theano_grads = theano.function([th_y_hat, th_y], T.grad(loss, wrt=th_y_hat))
        th_dL_dy_hat = get_theano_grads(y_hat, y)

        # quagga model
        context = Context()
        y_hat_gpu = Connector(Matrix.from_npa(y_hat), context, context)
        y_gpu = Connector(Matrix.from_npa(y))
        sse_block = SseBlock(y_hat_gpu, y_gpu)
        sse_block.fprop()
        sse_block.bprop()
        q_dL_dy_hat = y_hat_gpu.backward_matrix.to_host()

        r.append(np.allclose(th_dL_dy_hat, q_dL_dy_hat))

    self.assertEqual(sum(r), self.N)
class SequentialMeanPoolingBlock(object):
    # TODO(sergii): change sequentially_tile to add_sequentially_tile, because it can erase gradients
    def __init__(self, matrices, device_id=None):
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.output = Matrix.empty_like(matrices[0], device_id)
        learning = matrices[0].bpropagable
        self.output = Connector(self.output, device_id if learning else None)
        if learning:
            self.matrices, self.dL_dmatrices = izip(*matrices.register_usage(device_id, device_id))
        else:
            self.matrices = matrices.register_usage(device_id)
        self.length = matrices.length

    def fprop(self):
        self.output.assign_sequential_mean_pooling(self.context, self.matrices[:self.length])
        self.output.fprop()

    def bprop(self):
        dL_doutput = self.output.backward_matrix
        dL_doutput.scale(self.context, ct.c_float(1.0 / self.length))
        Matrix.sequentially_tile(self.context, dL_doutput, self.dL_dmatrices[:self.length])
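# A minimal NumPy sketch of what SequentialMeanPoolingBlock computes, for
# reference only: `fprop` averages the sequence element-wise, and `bprop`
# tiles the (scaled) output gradient back to every time step. The names
# `seq` and `dL_doutput` are illustrative, not part of the quagga API.
import numpy as np

def mean_pooling_fprop(seq):
    # seq: list of (batch_size, dim) arrays; output is their element-wise mean
    return sum(seq) / float(len(seq))

def mean_pooling_bprop(dL_doutput, seq_len):
    # every time step receives the same gradient, scaled by 1/seq_len
    return [dL_doutput / float(seq_len) for _ in xrange(seq_len)]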
def __init__(self, W, b, x, device_id=None):
    self.f_context = Context(device_id)
    device_id = self.f_context.device_id
    if W.bpropagable:
        self.W, self.dL_dW = W.register_usage(device_id, device_id)
    else:
        self.W = W.register_usage(device_id)
    if b:
        if b.bpropagable:
            self.b, self.dL_db = b.register_usage(device_id, device_id)
            self.ones = Matrix.empty(x.nrows, 1, self.b.dtype, device_id)
            self.ones.sync_fill(1.0)
        else:
            self.b = b.register_usage(device_id)
    if x.bpropagable:
        self.x, self.dL_dx = x.register_usage(device_id, device_id)
    else:
        self.x = x.register_usage(device_id)
    output = Matrix.empty(x.nrows, self.W.ncols, device_id=device_id)
    self.learning = hasattr(self, 'dL_dW') or hasattr(self, 'dL_db') or \
                    hasattr(self, 'dL_dx')
    if self.learning:
        self.b_context = Context(device_id)
        self.output = Connector(output, device_id)
    else:
        self.output = Connector(output)
def __init__(self, matrix, axis=1, device_id=None):
    self.context = Context(device_id)
    self._ctype = matrix.c_dtype
    self._zero = self._ctype(0.0)
    if axis == 0:
        self._ones = Matrix.empty(1, matrix.nrows, matrix.dtype, device_id)
        self.output = Matrix.empty(1, matrix.ncols, matrix.dtype, device_id)
        self.alpha = self._ctype(1.0 / matrix.nrows)
    elif axis == 1:
        self._ones = Matrix.empty(matrix.ncols, 1, matrix.dtype, device_id)
        self.output = Matrix.empty(matrix.nrows, 1, matrix.dtype, device_id)
        self.alpha = None
    else:
        raise ValueError('Invalid axis!')
    self._ones.sync_fill(1.0)
    self.axis = axis
    if matrix.bpropagable:
        self.matrix, self.dL_dmatrix = matrix.register_usage(self.context, self.context)
        self.output = Connector(self.output, self.context, self.context)
    else:
        self.matrix = matrix.register_usage(self.context)
        self.output = Connector(self.output, self.context)
class ColSlicingBlock(object):
    """
    Parameters
    ----------
    W : Matrix (GpuMatrix or CpuMatrix)
    col_indexes : Matrix (GpuMatrix or CpuMatrix)
        Indexes of the columns to be sliced from ``W``
    """

    def __init__(self, W, col_indexes):
        device_id = W.device_id
        self.context = Context(device_id)
        learning = W.bpropagable
        if learning:
            self.W, self.dL_dW = W.register_usage_with_sparse_backward_matrix()
        else:
            self.W = W.register_usage(device_id)
        self.col_indexes = col_indexes.register_usage(device_id)
        output = Matrix.empty(W.nrows, col_indexes.ncols, device_id=device_id)
        self.output = Connector(output, device_id if learning else None)

    def fprop(self):
        self.W.slice_columns(self.context, self.col_indexes, self.output)
        self.output.fprop()

    def bprop(self):
        if hasattr(self, 'dL_dW'):
            self.dL_dW.add_columns_slice(self.col_indexes, self.output.bprop())
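# For intuition only: a NumPy sketch of the column-slicing forward pass and
# its gradient accumulation, assuming quagga's `slice_columns` /
# `add_columns_slice` implement fancy column indexing and its scatter-add
# adjoint. Variable names are illustrative.
import numpy as np

W = np.random.randn(4, 10).astype(np.float32)
col_indexes = np.array([0, 7, 7, 3])
output = W[:, col_indexes]                                  # fprop: gather columns
dL_doutput = np.ones_like(output)
dL_dW = np.zeros_like(W)
np.add.at(dL_dW, (slice(None), col_indexes), dL_doutput)    # bprop: scatter-add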
class ArgmaxBlock(object):
    """
    Determines argmax values along the specified ``axis`` in the input
    matrix. The block returns a vector (a matrix with one of its dimensions
    equal to 1) of argmax values.

    Parameters
    ----------
    x : Matrix (GpuMatrix or CpuMatrix)
        Block's input
    axis : int
        Axis along which argmax is determined
    device_id : int
        Defines the device's id on which the computation will take place

    Returns
    -------
    vector
        A vector containing argmax values (e.g. the argmax of each row if
        axis == 1).
    """

    def __init__(self, x, axis, device_id=None):
        if axis != 1:
            raise NotImplementedError
        self.axis = axis
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.x = x.register_usage(device_id)
        self.output = Connector(Matrix.empty(x.nrows, 1, x.dtype, device_id))

    def fprop(self):
        self.x.argmax(self.context, self.output, self.axis)
        self.output.fprop()
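# Reference semantics in NumPy (a sketch, not the quagga API): with axis == 1
# the block produces one argmax per row, shaped as a column vector.
import numpy as np

x = np.array([[0.1, 0.7, 0.2],
              [0.9, 0.05, 0.05]], dtype=np.float32)
output = np.argmax(x, axis=1).reshape(-1, 1)   # [[1], [0]]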
class LastSelectorBlock(object):
    """
    TODO(igor).

    Parameters
    ----------
    x : Matrix (GpuMatrix or CpuMatrix)
    """

    def __init__(self, x):
        device_id = x[0].device_id
        learning = x[0].bpropagable
        self.context = Context(device_id)
        self.output = Matrix.empty_like(x[0])
        self.output = Connector(self.output, device_id if learning else None)
        if learning:
            self.x, self.dL_dx = izip(*x.register_usage(device_id, device_id))
        else:
            self.x = x.register_usage(device_id)
        self.last_idx = x.length - 1

    def fprop(self):
        self.output.assign(self.context, self.x[self.last_idx])
        self.output.fprop()

    def bprop(self):
        self.dL_dx[self.last_idx].add(self.context, self.output.backward_matrix)
def __init__(self, train_data, valid_data, batch_size, word_dropout_prob, device_id):
    self.train_data = HomogeneousDataIterator(train_data, batch_size, randomize=True, infinite=True)
    self.valid_data = HomogeneousDataIterator(valid_data, batch_size)
    self.train_data_iterator = iter(self.train_data)
    self.valid_data_iterator = iter(self.valid_data)
    self.word_keep_prob = 1.0 - word_dropout_prob
    self.rnd = RandomState(47571)
    self.unk_idx = word_to_idx['<UNK>']
    self.context = Context(device_id)
    c = Counter([len(line) for line in chain(train_data, valid_data)])
    print c.most_common()
    max_len = max([len(line) for line in chain(train_data, valid_data)])
    self.enc_x = Connector(Matrix.empty(batch_size, max_len, 'int', device_id))
    self.enc_lengths = Matrix.empty(self.enc_x.nrows, 1, 'int', device_id)
    self._enc_mask = Matrix.empty(self.enc_x.nrows, self.enc_x.ncols, 'float', device_id)
    self.enc_mask = List([Connector(self._enc_mask[:, i]) for i in xrange(max_len)],
                         self.enc_x.ncols)
    self.dec_x = Connector(Matrix.empty(batch_size, max_len + 1, 'int', device_id))
    self._dec_y = Matrix.empty(batch_size, max_len + 1, 'int', device_id)
    self.dec_y = List([Connector(self._dec_y[:, i]) for i in xrange(max_len + 1)],
                      self._dec_y.ncols)
    self.dec_lengths = Matrix.empty(self.dec_x.nrows, 1, 'int', device_id)
    self._dec_mask = Matrix.empty(self.dec_x.nrows, self.dec_x.ncols, 'float', device_id)
    self.dec_mask = List([Connector(self._dec_mask[:, i]) for i in xrange(max_len + 1)],
                         self.dec_x.ncols)
    self.blocking_contexts = None
    self.training_mode = True
def __init__(self, data, char_to_idx, batch_size, x_device_id, y_device_id):
    self.data = HomogeneousDataIterator(data, char_to_idx, batch_size, True, True)
    self.data_iterator = iter(self.data)
    self.x_context = Context(x_device_id)
    self.y_context = Context(y_device_id)
    max_len = max(len(sub_line) for sub_line in data)
    print max_len
    self.x = Connector(Matrix.empty(batch_size, max_len - 1, 'int', x_device_id))
    self._y = Matrix.empty(batch_size, max_len - 1, 'int', y_device_id)
    self.y = List([Connector(self._y[:, i]) for i in xrange(max_len - 1)], self.x.ncols)
    self.lengths = Matrix.empty(self.x.nrows, 1, 'int', x_device_id)
    self._mask = Matrix.empty(self.x.nrows, self.x.ncols, 'float', x_device_id)
    # _mask has max_len - 1 columns (one per column of x)
    self.mask = List([Connector(self._mask[:, i]) for i in xrange(max_len - 1)], self.x.ncols)
    self.blocking_contexts = None
class RepeatBlock(object):
    def __init__(self, x, repeats, axis=None, device_id=None):
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.repeats = repeats
        self.axis = axis
        learning = x.bpropagable
        if learning:
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
        else:
            self.x = x.register_usage(device_id)
        if axis == 0:
            self.output = Matrix.empty(x.nrows * repeats, x.ncols, x.dtype, device_id)
        elif axis == 1:
            self.output = Matrix.empty(x.nrows, x.ncols * repeats, x.dtype, device_id)
        else:
            raise ValueError('Invalid axis!')
        self.output = Connector(self.output, device_id if learning else None)

    def fprop(self):
        self.output.assign_repeat(self.context, self.x, self.repeats, self.axis)
        self.output.fprop()

    def bprop(self):
        if hasattr(self, 'dL_dx'):
            self.dL_dx.add_repeat_derivative(self.context, self.output.backward_matrix,
                                             self.repeats, self.axis)
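# A NumPy sketch of repeat semantics and the matching gradient (assumed to
# mirror `assign_repeat` / `add_repeat_derivative`; the block-tiling layout is
# an assumption, not taken from the quagga kernels): forward tiles the input
# along the chosen axis; backward sums the output gradient over the copies.
import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)
repeats = 4
out = np.tile(x, (repeats, 1))                      # axis == 0 case: (8, 3)
dL_dout = np.ones_like(out)
dL_dx = dL_dout.reshape(repeats, 2, 3).sum(axis=0)  # each entry gets `repeats` grads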
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        batch_size, dim = self.rng.random_integers(2000, size=2)
        y_hat = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
        y = self.rng.randn(batch_size, dim).astype(dtype=np.float32)

        quagga.processor_type = 'gpu'
        context = Context()
        y_hat_gpu = Connector(Matrix.from_npa(y_hat), context, context)
        y_gpu = Connector(Matrix.from_npa(y))
        sse_block = SseBlock(y_hat_gpu, y_gpu)
        sse_block.fprop()
        sse_block.bprop()
        dL_dy_hat_gpu = y_hat_gpu.backward_matrix.to_host()

        quagga.processor_type = 'cpu'
        context = Context()
        y_hat_cpu = Connector(Matrix.from_npa(y_hat), context, context)
        y_cpu = Connector(Matrix.from_npa(y))
        sse_block = SseBlock(y_hat_cpu, y_cpu)
        sse_block.fprop()
        sse_block.bprop()
        dL_dy_hat_cpu = y_hat_cpu.backward_matrix.to_host()

        r.append(np.allclose(dL_dy_hat_gpu, dL_dy_hat_cpu))

    self.assertEqual(sum(r), self.N)
def __init__(self, x, nonlinearity, device_id=None):
    self.f_context = Context(device_id)
    device_id = self.f_context.device_id
    self.learning = x.bpropagable
    if self.learning:
        self.b_context = Context(device_id)
        self.x, self.dL_dx = x.register_usage(device_id, device_id)
        self._df_dpref = Matrix.empty_like(self.x, device_id)
    else:
        self.x = x.register_usage(device_id)
    output = Matrix.empty_like(x, device_id)
    self.output = Connector(output, device_id if self.learning else None)
    if nonlinearity == 'sigmoid':
        self.f = self.x.sigmoid
    elif nonlinearity == 'tanh':
        self.f = self.x.tanh
    elif nonlinearity == 'relu':
        self.f = self.x.relu
    elif nonlinearity == 'softmax':
        raise ValueError('For softmax nonlinearity use SoftmaxBlock!')
    else:
        raise ValueError('Unknown nonlinearity: {}'.format(nonlinearity))
    self.training_mode = True
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]

        state = self.rng.get_state()
        quagga.processor_type = 'gpu'
        x_gpu = List([Connector(Matrix.from_npa(e)) for e in x])
        smean_pooling_block_gpu = SequentialMeanPoolingBlock(x_gpu)
        x_gpu.set_length(sequence_len)
        smean_pooling_block_gpu.fprop()
        output_gpu = smean_pooling_block_gpu.output.to_host()

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        x_cpu = List([Connector(Matrix.from_npa(e)) for e in x])
        smean_pooling_block_cpu = SequentialMeanPoolingBlock(x_cpu)
        x_cpu.set_length(sequence_len)
        smean_pooling_block_cpu.fprop()
        output_cpu = smean_pooling_block_cpu.output.to_host()

        r.append(np.allclose(output_gpu, output_cpu))

    self.assertEqual(sum(r), self.N)
class NonlinearityBlock(object):
    """
    Applies a nonlinear function (``sigmoid``, ``tanh``, ``relu``) to the input.

    Parameters
    ----------
    x : Matrix (GpuMatrix or CpuMatrix)
    nonlinearity : string
    device_id : int
    """

    def __init__(self, x, nonlinearity, device_id=None):
        self.f_context = Context(device_id)
        device_id = self.f_context.device_id
        self.learning = x.bpropagable
        if self.learning:
            self.b_context = Context(device_id)
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
            self._df_dpref = Matrix.empty_like(self.x, device_id)
        else:
            self.x = x.register_usage(device_id)
        output = Matrix.empty_like(x, device_id)
        self.output = Connector(output, device_id if self.learning else None)
        if nonlinearity == "sigmoid":
            self.f = self.x.sigmoid
        elif nonlinearity == "tanh":
            self.f = self.x.tanh
        elif nonlinearity == "relu":
            self.f = self.x.relu
        elif nonlinearity == "softmax":
            raise ValueError("For softmax nonlinearity use SoftmaxBlock!")
        else:
            raise ValueError("Unknown nonlinearity: {}".format(nonlinearity))
        self.training_mode = True

    @property
    def df_dpref(self):
        if self.training_mode and self.learning:
            return self._df_dpref

    def fprop(self):
        self.f(self.f_context, self.output, self.df_dpref)
        self.output.fprop()

    def bprop(self):
        if hasattr(self, "dL_dx"):
            # dL/dpref = dL/df .* df/dpref
            dL_df = self.output.backward_matrix
            self.dL_dx.add_hprod(self.b_context, dL_df, self.df_dpref)

    def set_training_mode(self):
        self.training_mode = True

    def set_testing_mode(self):
        self.training_mode = False
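# The block caches f'(pre-activation) in `_df_dpref` during fprop so that
# bprop can form dL/dpref = dL/df .* df/dpref with one element-wise product.
# A NumPy sketch of the cached derivatives (illustrative, not the quagga API):
import numpy as np

def derivatives(pre, nonlinearity):
    if nonlinearity == "sigmoid":
        f = 1.0 / (1.0 + np.exp(-pre))
        return f, f * (1.0 - f)          # sigmoid' = sigmoid * (1 - sigmoid)
    if nonlinearity == "tanh":
        f = np.tanh(pre)
        return f, 1.0 - f ** 2           # tanh' = 1 - tanh^2
    if nonlinearity == "relu":
        return np.maximum(pre, 0.0), (pre > 0).astype(pre.dtype)
    raise ValueError("Unknown nonlinearity: {}".format(nonlinearity))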
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]

        state = self.rng.get_state()
        quagga.processor_type = 'gpu'
        context = Context()
        x_gpu = List([Connector(Matrix.from_npa(e), context, context) for e in x])
        smean_pooling_block_gpu = SequentialMeanPoolingBlock(x_gpu)
        x_gpu.set_length(sequence_len)
        _, dL_doutput = smean_pooling_block_gpu.output.register_usage(context, context)
        smean_pooling_block_gpu.fprop()
        random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
        Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
        smean_pooling_block_gpu.bprop()
        dL_dmatrices_gpu = [e.backward_matrix.to_host() for e in x_gpu]

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        context = Context()
        x_cpu = List([Connector(Matrix.from_npa(e), context, context) for e in x])
        smean_pooling_block_cpu = SequentialMeanPoolingBlock(x_cpu)
        x_cpu.set_length(sequence_len)
        _, dL_doutput = smean_pooling_block_cpu.output.register_usage(context, context)
        smean_pooling_block_cpu.fprop()
        random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
        Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
        smean_pooling_block_cpu.bprop()
        dL_dmatrices_cpu = [e.backward_matrix.to_host() for e in x_cpu]

        for dL_dmatrix_gpu, dL_dmatrix_cpu in izip(dL_dmatrices_gpu, dL_dmatrices_cpu):
            if not np.allclose(dL_dmatrix_gpu, dL_dmatrix_cpu):
                r.append(False)
                break
        else:
            r.append(True)

    self.assertEqual(sum(r), self.N)
class SoftmaxCeBlock(object):
    """
    Softmax nonlinearity with mean cross entropy loss
    """

    def __init__(self, x, true_labels, mask=None, device_id=None):
        self.context = Context(device_id)
        device_id = self.context.device_id
        if x.bpropagable:
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
        else:
            self.x = x.register_usage(device_id)
        self.true_labels = true_labels.register_usage(device_id)
        if mask:
            self.mask = mask.register_usage(device_id)
        self.probs = Connector(Matrix.empty_like(self.x))
        self.loss = None

    def fprop(self):
        self.x.softmax(self.context, self.probs)
        self.probs.fprop()

    def bprop(self):
        if not hasattr(self, 'dL_dx'):
            return
        # error = (probs - true_labels) / M
        if self.true_labels.dtype == 'int':
            self.dL_dx.add_softmax_ce_derivative(self.context, self.probs, self.true_labels)
        else:
            self.dL_dx.add_scaled_subtraction(self.context, 1. / self.probs.nrows,
                                              self.probs, self.true_labels)
        if hasattr(self, 'mask'):
            self.dL_dx.hprod(self.context, self.mask)

    def calculate_loss(self, context):
        true_labels_np = self.true_labels.to_host(context)
        probs_np = self.probs.to_host(context)
        if hasattr(self, 'mask'):
            mask = self.mask.to_host(context)
            context.add_callback(self._calculate_ce_loss, true_labels_np, probs_np, mask)
        else:
            context.add_callback(self._calculate_ce_loss, true_labels_np, probs_np)

    def _calculate_ce_loss(self, true_labels_np, probs_np, mask=None):
        if self.true_labels.dtype == 'int':
            idxs = range(probs_np.shape[0]), true_labels_np.flatten()
            logs = np.log(probs_np[idxs] + 1e-20)
        else:
            logs = np.log(np.sum(true_labels_np * probs_np, axis=1) + 1e-20)
        if mask is not None:
            logs *= mask[:, 0]
            self.loss = -np.sum(logs) / np.sum(mask)
        else:
            self.loss = -np.mean(logs)
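# Sanity-check sketch of the gradient this block accumulates for integer
# labels: dL/dx = (softmax(x) - onehot(labels)) / batch_size. NumPy only;
# the function name is illustrative.
import numpy as np

def softmax_ce_grad(x, labels):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    probs = e / e.sum(axis=1, keepdims=True)
    grad = probs.copy()
    grad[np.arange(x.shape[0]), labels] -= 1.0   # subtract the one-hot targets
    return grad / x.shape[0]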
def __init__(self, x, axis, device_id=None):
    if axis != 1:
        raise NotImplementedError
    self.axis = axis
    self.context = Context(device_id)
    device_id = self.context.device_id
    self.x = x.register_usage(device_id)
    self.output = Connector(Matrix.empty(x.nrows, 1, x.dtype, device_id))
class PtbMiniBatchesGenerator(object):
    def __init__(self, ptb_train, ptb_valid, batch_size, sentence_max_len, device_id):
        self.blocking_contexts = None
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.train_offsets = HomogeneousDataGenerator(ptb_train, batch_size, sentence_max_len,
                                                      randomize=True, infinite=True)
        self.valid_offsets = HomogeneousDataGenerator(ptb_valid, batch_size, sentence_max_len)
        train_sentences = np.array([self.train_offsets.flatten_sentences])
        valid_sentences = np.array([self.valid_offsets.flatten_sentences])
        self.train_sents = Matrix.from_npa(train_sentences, 'int', device_id)
        self.valid_sents = Matrix.from_npa(valid_sentences, 'int', device_id)
        # keep a view ([...]), so that fprop can re-slice from its .base
        self._sent_lengths = np.empty((batch_size, 1), dtype=np.int32, order='F')[...]
        self.sent_lengths = Matrix.from_npa(self._sent_lengths, device_id=device_id)
        sentence_batch = Matrix.empty(batch_size, sentence_max_len, 'int', device_id)
        self.sentence_batch = Connector(sentence_batch, self.context)
        self.sentence_batch.sync_fill(0)
        self._mask = Matrix.empty(sentence_batch.nrows, self.sentence_batch.ncols, 'float', device_id)
        self.mask = List([Connector(self._mask[:, i]) for i in xrange(sentence_max_len)],
                         self.sentence_batch.ncols)
        self.train_offsets_iterator = iter(self.train_offsets)
        self.valid_offsets_iterator = iter(self.valid_offsets)
        self.training_mode = True

    def set_training_mode(self):
        self.training_mode = True

    def set_testing_mode(self):
        self.training_mode = False

    def fprop(self):
        if self.training_mode:
            offsets = next(self.train_offsets_iterator)
            sents = self.train_sents
        else:
            try:
                offsets = next(self.valid_offsets_iterator)
                sents = self.valid_sents
            except StopIteration as e:
                self.valid_offsets_iterator = iter(self.valid_offsets)
                raise e
        self.context.wait(*self.blocking_contexts)
        self._sent_lengths = self._sent_lengths.base[:len(offsets)]
        self.sentence_batch.nrows = len(offsets)
        for k, offset in enumerate(offsets):
            self.sentence_batch[k].assign(self.context, sents[:, offset[0]:offset[1]])
            self._sent_lengths[k] = offset[1] - offset[0]
        max_sent_len = int(np.max(self._sent_lengths))
        self.sentence_batch.last_modification_context = self.context
        self.sentence_batch.ncols = max_sent_len
        self.sent_lengths.assign_npa(self.context, self._sent_lengths)
        self._mask.mask_column_numbers_row_wise(self.context, self.sent_lengths)
        for e in self.mask:
            e.last_modification_context = self.context
        self.sentence_batch.fprop()
        self.mask.fprop()
def __init__(self, probs, true_labels, schedule, seed, device_id=None):
    self.schedule = schedule
    self.rnd = np.random.RandomState(seed)
    self.context = Context(device_id)
    device_id = self.context.device_id
    self.probs = probs.register_usage(device_id)
    self.true_labels = true_labels.register_usage(device_id)
    self.output = Connector(Matrix.empty_like(self.true_labels))
def test_fprop_matrix(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qW = Connector(Matrix.from_npa(W))
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            qW.fprop()
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            output[processor_type] = row_slicing_block.output.to_host()

        for output_gpu, output_cpu in izip(output['gpu'], output['cpu']):
            r.append(np.allclose(output_gpu, output_cpu))

    self.assertEqual(sum(r), len(r))
def test_theano_fprop_matrix(self):
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)

        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qW = Connector(Matrix.from_npa(W))
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        qW.fprop()
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        q_output = row_slicing_block.output.to_host()

        th_row_idxs = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        th_output = theano.function([th_row_idxs], toutput)(row_idxs)

        for t in xrange(sequence_len):
            r.append(np.allclose(q_output[t], th_output[t]))

    self.assertEqual(sum(r), len(r))
def test_bprop_vector(self):
    r = []
    for _ in xrange(self.N):
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, 1)).astype(np.int32)
        true_labels = self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
        device_id = 0

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            sce_block = SoftmaxCeBlock(row_slicing_block.output, qtrue_labels)
            qW.fprop()
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            row_slicing_block.bprop()
            qW.add(Context(), qW.backward_matrix)
            output[processor_type] = qW.to_host()

        r.append(np.allclose(output['gpu'], output['cpu']))

    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    r = []
    for i in xrange(self.N):
        matrices = []
        ncols = self.rng.random_integers(1, 3000)
        nrows = [0]
        row_slices = []
        device_ids = []
        for _ in xrange(self.rng.random_integers(1, 10)):
            _nrows = self.rng.random_integers(1, 2000)
            nrows.append(nrows[-1] + _nrows)
            if self.rng.choice([True, False]):
                device_ids.append(0)
                row_slices.append((nrows[-2], nrows[-1]))
            else:
                device_ids.append(None)
            matrices.append(self.rng.rand(_nrows, ncols).astype(np.float32))
        true_labels = self.rng.randint(ncols, size=(nrows[-1], 1)).astype(np.int32)
        if not row_slices:
            r.append(True)
            continue

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qmatrices = [Connector(Matrix.from_npa(m), d_id)
                         for m, d_id in izip(matrices, device_ids)]
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            vstack_block = VerticalStackBlock(*qmatrices)
            sce_block = SoftmaxCeBlock(vstack_block.output, qtrue_labels)
            for m in qmatrices:
                m.fprop()
            qtrue_labels.fprop()
            vstack_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            vstack_block.bprop()
            output[processor_type] = [m.backward_matrix.to_host()
                                      for m in qmatrices if m.bpropagable]

        for dL_dm_gpu, dL_dm_cpu in izip(output['gpu'], output['cpu']):
            if not np.allclose(dL_dm_gpu, dL_dm_cpu):
                r.append(False)
                break
        else:
            r.append(True)

    self.assertEqual(sum(r), self.N)
def __init__(self, W, col_indexes):
    device_id = W.device_id
    self.context = Context(device_id)
    learning = W.bpropagable
    if learning:
        self.W, self.dL_dW = W.register_usage_with_sparse_backward_matrix()
    else:
        self.W = W.register_usage(device_id)
    self.col_indexes = col_indexes.register_usage(device_id)
    output = Matrix.empty(W.nrows, col_indexes.ncols, device_id=device_id)
    self.output = Connector(output, device_id if learning else None)
def __init__(self, R, b, grad_clipping, mask, prev_c, prev_h, device_id=None):
    self.f_context = Context(device_id)
    device_id = self.f_context.device_id
    if R.bpropagable:
        self.R, self.dL_dR = R.register_usage(device_id, device_id)
        self.R_b_context = Context(device_id)
    else:
        self.R = R.register_usage(device_id)
    if b.bpropagable:
        self.b, self.dL_db = b.register_usage(device_id, device_id)
        self.b_b_context = Context(device_id)
    else:
        self.b = b.register_usage(device_id)
    self.grad_clipping = grad_clipping
    if mask:
        self.mask = mask.register_usage(device_id)
    if prev_c.bpropagable:
        self.prev_c, self.dL_dprev_c = prev_c.register_usage(device_id, device_id)
    else:
        self.prev_c = prev_c.register_usage(device_id)
    if prev_h.bpropagable:
        self.prev_h, self.dL_dprev_h = prev_h.register_usage(device_id, device_id)
    else:
        self.prev_h = prev_h.register_usage(device_id)
    self.learning = R.bpropagable or prev_c.bpropagable or prev_h.bpropagable
    if self.learning:
        self.b_context = Context(device_id)
    dim = self.R.nrows
    batch_size = self.prev_c.nrows
    # fused pre-activation buffer: z | i | f | o, one dim-wide slice per gate
    self.zifo = Matrix.empty(batch_size, 4 * dim, device_id=device_id)
    self.z = self.zifo[:, 0*dim:1*dim]
    self.i = self.zifo[:, 1*dim:2*dim]
    self.f = self.zifo[:, 2*dim:3*dim]
    self.o = self.zifo[:, 3*dim:4*dim]
    self.c = Matrix.empty_like(self.prev_c, device_id)
    self.c = Connector(self.c, device_id if self.learning else None)
    self.tanh_c = Matrix.empty_like(self.c, device_id)
    self.h = Matrix.empty_like(self.c, device_id)
    self.h = Connector(self.h, device_id if self.learning else None)
    if self.learning:
        self._dzifo_dpre_zifo = Matrix.empty_like(self.zifo)
        self.dz_dpre_z = self._dzifo_dpre_zifo[:, 0*dim:1*dim]
        self.di_dpre_i = self._dzifo_dpre_zifo[:, 1*dim:2*dim]
        self.df_dpre_f = self._dzifo_dpre_zifo[:, 2*dim:3*dim]
        self.do_dpre_o = self._dzifo_dpre_zifo[:, 3*dim:4*dim]
        self.dL_dpre_zifo = self._dzifo_dpre_zifo
        self.dL_dpre_z = self.dz_dpre_z
        self.dL_dpre_i = self.di_dpre_i
        self.dL_dpre_f = self.df_dpre_f
        self.dL_dpre_o = self.do_dpre_o
        self._dtanh_c_dc = Matrix.empty_like(self.c)
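# For orientation: the fused `zifo` buffer above holds the four LSTM
# pre-activations side by side. A NumPy sketch of the standard cell update
# those slices feed into (the gate equations are the usual LSTM ones; the
# weight layout is assumed, not taken from quagga's kernels):
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(pre_zifo, prev_c):
    dim = prev_c.shape[1]
    z = np.tanh(pre_zifo[:, 0*dim:1*dim])    # candidate
    i = sigmoid(pre_zifo[:, 1*dim:2*dim])    # input gate
    f = sigmoid(pre_zifo[:, 2*dim:3*dim])    # forget gate
    o = sigmoid(pre_zifo[:, 3*dim:4*dim])    # output gate
    c = i * z + f * prev_c
    h = o * np.tanh(c)
    return c, h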
def __init__(self, x):
    device_id = x[0].device_id
    learning = x[0].bpropagable
    self.context = Context(device_id)
    self.output = Matrix.empty_like(x[0])
    self.output = Connector(self.output, device_id if learning else None)
    if learning:
        self.x, self.dL_dx = izip(*x.register_usage(device_id, device_id))
    else:
        self.x = x.register_usage(device_id)
    self.last_idx = x.length - 1
class DropoutBlock(object):
    """
    Sets elements of the input matrix ``x`` to zero with probability
    ``dropout_prob`` in training mode. Scales ``x`` by a factor of
    ``1 - dropout_prob`` in testing mode.

    Parameters
    ----------
    dropout_prob : float
    x : :class:`~quagga.matrix.CpuMatrix` or :class:`~quagga.matrix.GpuMatrix`
    seed : int
    device_id : int
        Defines the device's id on which the computation will take place

    Notes
    -----
    The dropout block is a regularizer that randomly sets input values to
    zero in training mode. This procedure is supposed to improve
    generalization. During testing, the dropout block scales input values.
    """

    def __init__(self, dropout_prob, x, seed=42, device_id=None):
        self.dropout_prob = dropout_prob
        self.f_context = Context(device_id)
        device_id = self.f_context.device_id
        self.generator = Matrix.get_random_generator(seed)
        if x.bpropagable:
            self.b_context = Context(device_id)
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
        else:
            self.x = x.register_usage(device_id)
        self.output = Matrix.empty_like(self.x)
        self.output = Connector(self.output, device_id if x.bpropagable else None)
        self.training_mode = True

    def fprop(self):
        if self.training_mode:
            self.x.dropout(self.f_context, self.generator, self.dropout_prob, self.output)
        else:
            self.x.scale(self.f_context, 1.0 - self.dropout_prob, self.output)
        self.output.fprop()

    def bprop(self):
        if hasattr(self, 'dL_dx') and self.training_mode:
            dL_doutput = self.output.backward_matrix
            self.dL_dx.add_mask_zeros(self.b_context, dL_doutput, self.output)

    def set_training_mode(self):
        self.training_mode = True

    def set_testing_mode(self):
        self.training_mode = False
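# Note the convention: this block keeps activations unscaled in training and
# multiplies by (1 - dropout_prob) at test time, matching the original
# dropout paper; the now-common "inverted dropout" instead rescales by
# 1/(1 - dropout_prob) during training and leaves test time untouched.
# A NumPy sketch of both phases of this block's convention (illustrative):
import numpy as np

def dropout_train(x, dropout_prob, rng):
    mask = (rng.rand(*x.shape) >= dropout_prob).astype(x.dtype)
    return x * mask

def dropout_test(x, dropout_prob):
    return x * (1.0 - dropout_prob)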
def __init__(self, **kwargs):
    self.parameters = {}
    self.trainable_parameters = {}
    for name, definition in kwargs.iteritems():
        device_id = definition['device_id']
        matrix = Matrix.from_npa(definition['init'](), device_id=device_id)
        if 'trainable' not in definition or definition['trainable']:
            param = Connector(matrix, device_id)
            self.trainable_parameters[name] = param
        else:
            param = Connector(matrix)
        self.parameters[name] = param
def __init__(self, x, true_labels, mask=None, device_id=None):
    self.context = Context(device_id)
    device_id = self.context.device_id
    if x.bpropagable:
        self.x, self.dL_dx = x.register_usage(device_id, device_id)
    else:
        self.x = x.register_usage(device_id)
    self.true_labels = true_labels.register_usage(device_id)
    if mask:
        self.mask = mask.register_usage(device_id)
    self.probs = Connector(Matrix.empty_like(self.x))
    self.loss = None
def __init__(self, matrices, device_id=None):
    self.context = Context(device_id)
    device_id = self.context.device_id
    self.output = Matrix.empty_like(matrices[0], device_id)
    learning = matrices[0].bpropagable
    self.output = Connector(self.output, device_id if learning else None)
    if learning:
        self.matrices, self.dL_dmatrices = izip(*matrices.register_usage(device_id, device_id))
    else:
        self.matrices = matrices.register_usage(device_id)
    self.length = matrices.length
class DataBlock(object):
    def __init__(self, word_to_idx, device_id):
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.word_idx = Connector(Matrix.empty(1, 1, 'int', device_id))
        self.word_to_idx = word_to_idx
        self.word = None

    def fprop(self):
        word_npa = np.zeros((1, 1), np.int32, 'F')
        word_npa[0][0] = (self.word_to_idx[self.word]
                          if self.word in self.word_to_idx else
                          self.word_to_idx['<UNK>'])
        self.word_idx.assign_npa(self.context, word_npa)
        self.word_idx.fprop()
def __init__(self, dropout_prob, x, seed=42, device_id=None):
    self.dropout_prob = dropout_prob
    self.f_context = Context(device_id)
    device_id = self.f_context.device_id
    self.generator = Matrix.get_random_generator(seed)
    if x.bpropagable:
        self.b_context = Context(device_id)
        self.x, self.dL_dx = x.register_usage(device_id, device_id)
    else:
        self.x = x.register_usage(device_id)
    self.output = Matrix.empty_like(self.x)
    self.output = Connector(self.output, device_id if x.bpropagable else None)
    self.training_mode = True
class DataBlock(object):
    def __init__(self, char_to_idx, device_id):
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.char_idx = Connector(Matrix.empty(1, 1, 'int', device_id))
        self.char_to_idx = char_to_idx
        self.char = None

    def fprop(self):
        char_npa = np.zeros((1, 1), np.int32, 'F')
        char_npa[0][0] = (self.char_to_idx[self.char]
                          if self.char in self.char_to_idx else
                          self.char_to_idx['<unk>'])
        self.char_idx.assign_npa(self.context, char_npa)
        self.char_idx.fprop()
class GaussianNoiseBlock(object):
    """
    Adds Gaussian noise to the block's input, which can be viewed as a form
    of regularization.

    Parameters
    ----------
    mean : float
        Expected value of Gaussian noise
    std : float
        Standard deviation of added Gaussian noise
    x : matrix
        Block's input
    seed : int
        Seed for :func:`quagga.cuda.curand.create_generator`
    device_id : int
        Defines the device's id on which the computation will take place
    """

    def __init__(self, mean, std, x, seed=42, device_id=None):
        self.mean = mean
        self.std = std
        self.f_context = Context(device_id)
        device_id = self.f_context.device_id
        self.generator = Matrix.get_random_generator(seed)
        if x.bpropagable:
            self.b_context = Context(device_id)
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
        else:
            self.x = x.register_usage(device_id)
        self.output = Matrix.empty_like(self.x)
        self.output = Connector(self.output, device_id if x.bpropagable else None)
        self.training_mode = True

    def fprop(self):
        if self.training_mode:
            self.x.add_gaussian_noise(self.f_context, self.generator, self.mean,
                                      self.std, self.output)
        else:
            self.output.assign(self.f_context, self.x)
        self.output.fprop()

    def bprop(self):
        # guard for the non-bpropagable case, mirroring the other blocks
        if hasattr(self, 'dL_dx'):
            self.dL_dx.add(self.b_context, self.output.backward_matrix)

    def set_training_mode(self):
        self.training_mode = True

    def set_testing_mode(self):
        self.training_mode = False
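# A NumPy sketch of the block's two modes (illustrative, not the quagga API):
# noise is injected only in training mode; at test time the input passes
# through unchanged, and because the noise is additive, the gradient w.r.t.
# x is just dL/doutput.
import numpy as np

def gaussian_noise_fprop(x, mean, std, training_mode, rng):
    if training_mode:
        return x + rng.normal(mean, std, x.shape).astype(x.dtype)
    return x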
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim_x, dim_y = self.rng.random_integers(1500, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]

        state = self.rng.get_state()
        quagga.processor_type = 'gpu'
        x_gpu = List([Connector(Matrix.from_npa(e)) for e in x])
        y_gpu = List([Connector(Matrix.from_npa(e)) for e in y])
        seq_hstack_block_gpu = SequentialHorizontalStackBlock(x_gpu, y_gpu)
        x_gpu.length = sequence_len
        y_gpu.length = sequence_len
        seq_hstack_block_gpu.fprop()
        output_sequence_gpu = seq_hstack_block_gpu.output.to_host()

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        x_cpu = List([Connector(Matrix.from_npa(e)) for e in x])
        y_cpu = List([Connector(Matrix.from_npa(e)) for e in y])
        seq_hstack_block_cpu = SequentialHorizontalStackBlock(x_cpu, y_cpu)
        x_cpu.length = sequence_len
        y_cpu.length = sequence_len
        seq_hstack_block_cpu.fprop()
        output_sequence_cpu = seq_hstack_block_cpu.output.to_host()

        for out_gpu, out_cpu in izip(output_sequence_gpu, output_sequence_cpu):
            if not np.allclose(out_gpu, out_cpu):
                r.append(False)
                break
        else:
            r.append(True)

    self.assertEqual(sum(r), self.N)
class DataBlock(object):
    def __init__(self, data, char_to_idx, batch_size, x_device_id, y_device_id):
        self.data = HomogeneousDataIterator(data, char_to_idx, batch_size, True, True)
        self.data_iterator = iter(self.data)
        self.x_context = Context(x_device_id)
        self.y_context = Context(y_device_id)
        max_len = max(len(sub_line) for sub_line in data)
        print max_len
        self.x = Connector(Matrix.empty(batch_size, max_len - 1, 'int', x_device_id))
        self._y = Matrix.empty(batch_size, max_len - 1, 'int', y_device_id)
        self.y = List([Connector(self._y[:, i]) for i in xrange(max_len - 1)], self.x.ncols)
        self.lengths = Matrix.empty(self.x.nrows, 1, 'int', x_device_id)
        self._mask = Matrix.empty(self.x.nrows, self.x.ncols, 'float', x_device_id)
        # _mask has max_len - 1 columns (one per column of x)
        self.mask = List([Connector(self._mask[:, i]) for i in xrange(max_len - 1)], self.x.ncols)
        self.blocking_contexts = None

    def fprop(self):
        self.x_context.wait(*self.blocking_contexts)
        self.y_context.wait(*self.blocking_contexts)
        data = next(self.data_iterator)
        lengths_npa = np.array([[len(e) - 1] for e in data], np.int32, order='F')
        x_npa = np.zeros((len(data), int(np.max(lengths_npa))), np.int32, 'F')
        for k, e in enumerate(data):
            x_npa[k, :len(e) - 1] = e[:-1]
        self.x.assign_npa(self.x_context, x_npa)
        y_npa = np.zeros((len(data), int(np.max(lengths_npa))), np.int32, 'F')
        for k, e in enumerate(data):
            y_npa[k, :len(e) - 1] = e[1:]
        self._y.assign_npa(self.y_context, y_npa)
        for e in self.y:
            e.last_modification_context = self.y_context
        self.lengths.assign_npa(self.x_context, lengths_npa)
        self._mask.mask_column_numbers_row_wise(self.x_context, self.lengths)
        for e in self.mask:
            e.last_modification_context = self.x_context
        self.x.fprop()
        self.y.fprop()
        self.mask.fprop()
class DropoutBlock(object):
    """
    Sets elements of the input matrix ``x`` to zero with probability
    ``dropout_prob`` in training mode. Scales ``x`` by a factor of
    ``1 - dropout_prob`` in testing mode.

    Parameters
    ----------
    dropout_prob : float
    x : Matrix (GpuMatrix or CpuMatrix)
    seed : int
    device_id : int
        Defines the device's id on which the computation will take place

    Notes
    -----
    The dropout block is a regularizer that randomly sets input values to
    zero in training mode. This procedure is supposed to improve
    generalization. During testing, the dropout block scales input values.
    """

    def __init__(self, dropout_prob, x, seed=42, device_id=None):
        self.dropout_prob = dropout_prob
        self.f_context = Context(device_id)
        device_id = self.f_context.device_id
        self.generator = Matrix.get_random_generator(seed)
        if x.bpropagable:
            self.b_context = Context(device_id)
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
        else:
            self.x = x.register_usage(device_id)
        self.output = Matrix.empty_like(self.x)
        self.output = Connector(self.output, device_id if x.bpropagable else None)
        self.training_mode = True

    def fprop(self):
        if self.training_mode:
            self.x.dropout(self.f_context, self.generator, self.dropout_prob, self.output)
        else:
            self.x.scale(self.f_context, 1.0 - self.dropout_prob, self.output)
        self.output.fprop()

    def bprop(self):
        if hasattr(self, 'dL_dx') and self.training_mode:
            dL_doutput = self.output.backward_matrix
            self.dL_dx.add_mask_zeros(self.b_context, dL_doutput, self.output)

    def set_training_mode(self):
        self.training_mode = True

    def set_testing_mode(self):
        self.training_mode = False
class AttentionBlock(object):
    """
    Location based attention block

        out = sum_{i=1}^{T} a_i * h_i
        a_i = softmax(h_i * u)
    """

    def __init__(self, matrices, u, mask=None, device_id=None):
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.output = Matrix.empty_like(matrices[0], device_id)
        learning = matrices[0].bpropagable or u.bpropagable
        self.output = Connector(self.output, device_id if learning else None)
        if matrices[0].bpropagable:
            self.matrices, self.dL_dmatrices = \
                izip(*matrices.register_usage(device_id, device_id))
        else:
            self.matrices = matrices.register_usage(device_id)
        self.length = matrices.length
        if u.bpropagable:
            self.u, self.dL_du = u.register_usage(device_id, device_id)
        else:
            self.u = u.register_usage(device_id)
        if mask:
            self.mask = mask.register_usage(device_id)
        self.a = Matrix.empty(matrices[0].nrows, matrices.length, 'float', device_id)
        self.dL_dpre_a = Matrix.empty_like(self.a)
        self.a_cols = [self.a[:, i] for i in xrange(len(self.matrices))]

    def fprop(self):
        for i in xrange(self.length):
            self.a_cols[i].assign_dot(self.context, self.matrices[i], self.u)
        if hasattr(self, 'mask'):
            # fill masked-out positions with -FLT_MAX so softmax zeroes them out
            self.a.fill(self.context, -3.402823466e+38, self.mask, 0.0)
        self.a.softmax(self.context, self.a)
        self.output.assign_sequential_weighted_sum(self.context, self.a,
                                                   self.matrices[:self.length])
        self.output.fprop()

    def bprop(self):
        dL_doutput = self.output.backward_matrix
        self.dL_dpre_a.assign_dL_dpre_a(self.context, dL_doutput, self.a,
                                        self.matrices[:self.length])
        if hasattr(self, 'dL_dmatrices'):
            Matrix.add_attention_tile(self.context, dL_doutput, self.a,
                                      self.dL_dpre_a, self.u,
                                      self.dL_dmatrices[:self.length])
        if hasattr(self, 'dL_du'):
            self.dL_du.add_attention_derivative(self.context, self.dL_dpre_a,
                                                self.matrices[:self.length])
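# A NumPy reference of the forward pass above (a sketch only; it assumes each
# h_i is (batch_size, dim) and u is (dim, 1), matching the per-step dot
# products, and that softmax is taken over the T time steps):
import numpy as np

def attention_fprop(hs, u):
    # scores: one column per time step, one row per batch element
    scores = np.hstack([h.dot(u) for h in hs])            # (batch, T)
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    a = e / e.sum(axis=1, keepdims=True)                  # softmax over T
    out = sum(a[:, i:i + 1] * hs[i] for i in xrange(len(hs)))
    return out, a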
class SigmoidCeBlock(object):
    """
    Sigmoid nonlinearity with mean cross entropy loss
    """

    def __init__(self, x, true_labels, mask=None, device_id=None):
        self.context = Context(device_id)
        device_id = self.context.device_id
        if x.bpropagable:
            self.x, self.dL_dx = x.register_usage(device_id, device_id)
        else:
            self.x = x.register_usage(device_id)
        self.true_labels = true_labels.register_usage(device_id)
        if mask:
            self.mask = mask.register_usage(device_id)
        self.probs = Connector(Matrix.empty_like(self.x))
        self.loss = None

    def fprop(self):
        self.x.sigmoid(self.context, self.probs)
        self.probs.fprop()

    def bprop(self):
        # guard for the non-bpropagable case, mirroring SoftmaxCeBlock
        if not hasattr(self, 'dL_dx'):
            return
        # error = (probs - true_labels) / M
        self.dL_dx.add_scaled_subtraction(self.context,
                                          1. / float(self.probs.nrows),
                                          self.probs, self.true_labels)
        if hasattr(self, 'mask'):
            self.dL_dx.hprod(self.context, self.mask)

    def calculate_loss(self, context):
        true_labels_np = self.true_labels.to_host(context)
        probs_np = self.probs.to_host(context)
        if hasattr(self, 'mask'):
            mask = self.mask.to_host(context)
            context.add_callback(self._calculate_ce_loss, true_labels_np, probs_np, mask)
        else:
            context.add_callback(self._calculate_ce_loss, true_labels_np, probs_np)

    def _calculate_ce_loss(self, true_labels_np, probs_np, mask=None):
        logs = true_labels_np * np.log(probs_np + 1e-20) + \
               (1.0 - true_labels_np) * np.log(1. - probs_np + 1e-20)
        if mask is not None:
            logs *= mask
            self.loss = -np.sum(logs) / (np.sum(mask) * logs.shape[1])
        else:
            self.loss = -np.mean(logs)
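# Sketch of the per-element binary cross entropy this block averages and the
# gradient it accumulates (NumPy only, names illustrative). Note that, like
# the block, the loss is a mean over all elements while the gradient is
# scaled by 1/nrows only.
import numpy as np

def sigmoid_ce(x, t):
    p = 1.0 / (1.0 + np.exp(-x))
    loss = -np.mean(t * np.log(p + 1e-20) + (1.0 - t) * np.log(1.0 - p + 1e-20))
    grad = (p - t) / x.shape[0]
    return loss, grad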
def test_bprop(self):
    r = []
    for i in xrange(self.N):
        matrices = []
        nrows = self.rng.random_integers(1, 3000)
        ncols = [0]
        col_slices = []
        device_ids = []
        for _ in xrange(self.rng.random_integers(1, 10)):
            _ncols = self.rng.random_integers(1, 2000)
            ncols.append(ncols[-1] + _ncols)
            if self.rng.choice([True, False]):
                device_ids.append(0)
                col_slices.append((ncols[-2], ncols[-1]))
            else:
                device_ids.append(None)
            matrices.append(self.rng.rand(nrows, _ncols).astype(np.float32))
        true_labels = self.rng.randint(ncols[-1], size=(nrows, 1)).astype(np.int32)
        if not col_slices:
            r.append(True)
            continue

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qmatrices = [Connector(Matrix.from_npa(m), d_id)
                         for m, d_id in izip(matrices, device_ids)]
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            hstack_block = HorizontalStackBlock(*qmatrices)
            sce_block = SoftmaxCeBlock(hstack_block.output, qtrue_labels)
            for m in qmatrices:
                m.fprop()
            qtrue_labels.fprop()
            hstack_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            hstack_block.bprop()
            output[processor_type] = [m.backward_matrix.to_host()
                                      for m in qmatrices if m.bpropagable]

        for dL_dm_gpu, dL_dm_cpu in izip(output['gpu'], output['cpu']):
            if not np.allclose(dL_dm_gpu, dL_dm_cpu):
                r.append(False)
                break
        else:
            r.append(True)

    self.assertEqual(sum(r), self.N)
def __init__(self, x, nonlinearity, device_id=None):
    self.f_context = Context(device_id)
    device_id = self.f_context.device_id
    self.learning = x.bpropagable
    if self.learning:
        self.b_context = Context(device_id)
        self.x, self.dL_dx = x.register_usage(device_id, device_id)
        self._df_dpref = Matrix.empty_like(self.x, device_id)
    else:
        self.x = x.register_usage(device_id)
    output = Matrix.empty_like(x, device_id)
    self.output = Connector(output, device_id if self.learning else None)
    if nonlinearity == "sigmoid":
        self.f = self.x.sigmoid
    elif nonlinearity == "tanh":
        self.f = self.x.tanh
    elif nonlinearity == "relu":
        self.f = self.x.relu
    elif nonlinearity == "softmax":
        raise ValueError("For softmax nonlinearity use SoftmaxBlock!")
    else:
        raise ValueError("Unknown nonlinearity: {}".format(nonlinearity))
    self.training_mode = True
class HorizontalStackBlock(object):
    """
    Concatenates input matrices horizontally.

    Parameters
    ----------
    matrices : Matrix (GpuMatrix or CpuMatrix)
        Input matrices that need to be concatenated.
    device_id : int
        Defines the device's id on which the computation will take place
    """

    def __init__(self, *matrices, **kwargs):
        # TODO(sergii): change hsplit to additive_hsplit for proper gradient accumulation
        self.context = Context(kwargs.get('device_id'))
        device_id = self.context.device_id
        self.matrices = []
        self.dL_dmatrices = []
        self.bpropagable = []
        for matrix in matrices:
            self.bpropagable.append(matrix.bpropagable)
            if matrix.bpropagable:
                matrix, dL_dmatrix = matrix.register_usage(device_id, device_id)
                self.dL_dmatrices.append(dL_dmatrix)
            else:
                matrix = matrix.register_usage(device_id)
            self.matrices.append(matrix)
        ncols = [matrix.ncols for matrix in matrices]
        ncols = sum(ncols[1:], ncols[0])
        dtype = matrices[0].dtype
        bu_device_id = device_id if self.dL_dmatrices else None
        output = Matrix.empty(matrices[0].nrows, ncols, dtype, device_id)
        self.output = Connector(output, bu_device_id)

    def fprop(self):
        self.output.assign_hstack(self.context, self.matrices)
        self.output.fprop()

    def bprop(self):
        if self.dL_dmatrices:
            col_slices = []
            ncols = [0]
            for matrix, bpropagable in izip(self.matrices, self.bpropagable):
                ncols.append(ncols[-1] + int(matrix.ncols))
                if bpropagable:
                    col_slices.append((ncols[-2], ncols[-1]))
            self.output.backward_matrix.hsplit(self.context, self.dL_dmatrices, col_slices)
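# NumPy view of the stack/split pair (a sketch): fprop concatenates along
# columns; bprop splits the output gradient back at the recorded column
# boundaries, skipping inputs that do not need gradients.
import numpy as np

a = np.ones((2, 3), dtype=np.float32)
b = np.zeros((2, 2), dtype=np.float32)
out = np.hstack([a, b])                         # fprop: (2, 5)
dL_da, dL_db = np.hsplit(np.ones_like(out), [3])  # bprop: split at column 3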
class ScheduledSamplingBlock(object):
    def __init__(self, probs, true_labels, schedule, seed, device_id=None):
        self.schedule = schedule
        self.rnd = np.random.RandomState(seed)
        self.context = Context(device_id)
        device_id = self.context.device_id
        self.probs = probs.register_usage(device_id)
        self.true_labels = true_labels.register_usage(device_id)
        self.output = Connector(Matrix.empty_like(self.true_labels))

    def fprop(self):
        if self.rnd.binomial(1, self.schedule.value):
            self.output.assign(self.context, self.true_labels)
        else:
            self.probs.argmax(self.context, self.output, axis=1)
        self.output.fprop()
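# Scheduled sampling (Bengio et al., 2015): with probability schedule.value
# the decoder is fed the ground-truth label, otherwise its own argmax
# prediction; decaying that probability over training moves the model from
# teacher forcing toward free-running generation. A runnable sketch of the
# coin flip this block performs each fprop (schedule_value is an illustrative
# stand-in for schedule.value):
import numpy as np

rnd = np.random.RandomState(42)
schedule_value = 0.9                          # current keep-probability
feed_truth = rnd.binomial(1, schedule_value)  # 1 -> true labels, 0 -> argmax(probs)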
def __init__(self, x, device_id=None):
    self.context = Context(device_id)
    device_id = self.context.device_id
    self.learning = x.bpropagable
    if self.learning:
        self.x, self.dL_dx = x.register_usage(device_id, device_id)
    else:
        self.x = x.register_usage(device_id)
    self.output = Connector(Matrix.empty_like(self.x),
                            device_id if self.learning else None)
def test_theano_bprop_matrix(self):
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(2, max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        device_id = 0

        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qrow_idxs.ncols)
        qW = Connector(Matrix.from_npa(W), device_id)
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                       params=[],
                                       sequences=[row_slicing_block.output, qtrue_labels])
        qW.fprop()
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        seq_sce_block.fprop()
        seq_sce_block.bprop()
        row_slicing_block.bprop()
        qW.add(Context(), qW.backward_matrix)

        th_row_idxs = T.imatrix()
        th_true_labels = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, row_slicing_layer.W)
        fun = theano.function([th_row_idxs, th_true_labels],
                              updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
        fun(row_idxs, np.hstack(true_labels[:sequence_len]))

        r.append(np.allclose(qW.to_host(),
                             row_slicing_layer.W.get_value(),
                             atol=1e-5))

    self.assertEqual(sum(r), len(r))
def test_theano_grad(self):
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        for sparse in [True, False]:
            batch_size, dim = self.rng.random_integers(2000, size=2)
            if sparse:
                true_labels = np.zeros((batch_size, dim), np.float32)
                for k, j in enumerate(self.rng.randint(dim, size=batch_size)):
                    true_labels[k, j] = 1.0
            else:
                true_labels = self.rng.randint(dim, size=(batch_size, 1)).astype(np.int32)
            x = self.rng.randn(batch_size, dim).astype(np.float32)
            mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
            device_id = 0

            for with_mask in [False, True]:
                # Theano model
                th_x = T.fmatrix()
                th_mask = T.fcol()
                th_true_labels = T.fmatrix() if sparse else T.ivector()
                if with_mask:
                    probs = T.nnet.softmax(th_mask * th_x)
                else:
                    probs = T.nnet.softmax(th_x)
                loss = T.mean(T.nnet.categorical_crossentropy(probs, th_true_labels))
                if with_mask:
                    get_theano_grads = theano.function([th_x, th_true_labels, th_mask],
                                                       T.grad(loss, wrt=th_x))
                    th_dL_dx = get_theano_grads(x, true_labels if sparse else true_labels[:, 0], mask)
                else:
                    get_theano_grads = theano.function([th_x, th_true_labels],
                                                       T.grad(loss, wrt=th_x))
                    th_dL_dx = get_theano_grads(x, true_labels if sparse else true_labels[:, 0])

                # quagga model
                x_gpu = Connector(Matrix.from_npa(x), device_id)
                true_labels_gpu = Connector(Matrix.from_npa(true_labels))
                mask_gpu = Connector(Matrix.from_npa(mask)) if with_mask else None
                softmax_ce_block = SoftmaxCeBlock(x_gpu, true_labels_gpu, mask_gpu)
                x_gpu.fprop()
                true_labels_gpu.fprop()
                if with_mask:
                    mask_gpu.fprop()
                softmax_ce_block.fprop()
                softmax_ce_block.bprop()
                q_dL_dx = x_gpu.backward_matrix.to_host()

                r.append(np.allclose(th_dL_dx, q_dL_dx))

    self.assertEqual(sum(r), len(r))
class RowSlicingBlock(object):
    def __init__(self, W, row_indexes, dense=True):
        self.dense = dense
        device_id = W.device_id
        self.context = Context(device_id)
        learning = W.bpropagable
        if learning:
            if dense:
                self.W, self.dL_dW = W.register_usage(device_id, device_id)
            else:
                self.W, self.dL_dW = W.register_usage_with_sparse_backward_matrix()
        else:
            self.W = W.register_usage(device_id)
        self.row_indexes = row_indexes.register_usage(device_id)
        if row_indexes.ncols > 1:
            self.output = []
            for i in xrange(row_indexes.ncols):
                output = Matrix.empty(row_indexes.nrows, W.ncols, device_id=device_id)
                output = Connector(output, device_id if learning else None)
                self.output.append(output)
            self.output = List(self.output, row_indexes.ncols)
        else:
            output = Matrix.empty(row_indexes.nrows, W.ncols, device_id=device_id)
            self.output = Connector(output, device_id if learning else None)

    def fprop(self):
        if isinstance(self.output, List):
            self.W.slice_rows_batch(self.context, self.row_indexes, self.output)
        else:
            self.W.slice_rows(self.context, self.row_indexes, self.output)
        self.output.fprop()

    def bprop(self):
        if hasattr(self, 'dL_dW'):
            if isinstance(self.output, List):
                update_method = self.dL_dW.add_rows_batch_slice
            else:
                update_method = self.dL_dW.add_rows_slice
            if self.dense:
                update_method(self.context, self.row_indexes, self.output.bprop())
            else:
                update_method(self.row_indexes, self.output.bprop())
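# RowSlicingBlock is effectively an embedding lookup: fprop gathers rows of W
# by index, bprop scatter-adds the output gradients into dL/dW (densely in
# this sketch; the sparse path stores index/gradient pairs instead). NumPy
# sketch with illustrative names:
import numpy as np

W = np.random.randn(10, 4).astype(np.float32)
row_indexes = np.array([2, 2, 5])
output = W[row_indexes]                       # fprop: one row per index
dL_doutput = np.ones_like(output)
dL_dW = np.zeros_like(W)
np.add.at(dL_dW, row_indexes, dL_doutput)     # bprop: repeated rows accumulate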
def test_fprop(self):
    r = []
    for i in xrange(self.N):
        repeats = self.rng.random_integers(42)
        axis = self.rng.randint(2)
        input_dim, output_dim = self.rng.random_integers(2000, size=2)
        x = self.get_normal_matrix(input_dim, output_dim)

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qx = Connector(Matrix.from_npa(x))
            repeat_block = RepeatBlock(qx, repeats, axis)
            qx.fprop()
            repeat_block.fprop()
            output[processor_type] = repeat_block.output.to_host()

        r.append(np.allclose(output['gpu'], output['cpu']))

    self.assertEqual(sum(r), len(r))