Example #1 (score: 0)
File: rnn.py — Project: xy1802/PYNQ-Torch
    def forward(input, weight, hx, batch_sizes):
        # Run the cuDNN RNN forward pass through the torch._cudnn_rnn ATen op.
        # NOTE(review): `mode`, `flat_weight`, `hidden_size`, `num_layers`,
        # `batch_first`, `dropout`, `train`, `bidirectional`, `variable_length`,
        # `dropout_seed` and `dropout_state` are free variables captured from an
        # enclosing scope that is not visible in this snippet — confirm against
        # the surrounding factory function.
        if mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state arrives as an (h, c) tuple; split it.
            hx, cx = hx
        else:
            # Non-LSTM modes carry no cell state.
            cx = None

        # NOTE(review): `handle` is never used below — presumably kept for the
        # side effect of initializing the cuDNN handle on this device; verify.
        handle = cudnn.get_handle()
        with torch.cuda.device(input.get_device()):
            # Dropout RNG state must be created on the input's device.
            dropout_ts = cudnn.rnn.init_dropout_state(dropout, train, dropout_seed, dropout_state)

        # Flatten the per-layer weight lists into one flat list; the op needs
        # the number of weight tensors per layer as the stride.
        weight_arr = list(itertools.chain.from_iterable(weight))
        weight_stride0 = len(weight[0])

        output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
            input, weight_arr, weight_stride0,
            flat_weight,
            hx, cx,
            mode, hidden_size, num_layers,
            batch_first, dropout, train, bool(bidirectional),
            list(batch_sizes.data) if variable_length else (),
            dropout_ts)

        # LSTM returns (output, (h_n, c_n)); other modes return (output, h_n).
        if cx is not None:
            return (output, (hy, cy))
        else:
            return (output, hy)
Example #2 (score: 0)
    def forward(input, weight, hx, batch_sizes):
        # cuDNN RNN forward pass (variant of this binding that passes the
        # dropout-state dtype explicitly and wraps flat_weight in a Variable).
        # NOTE(review): `mode`, `flat_weight`, `hidden_size`, `num_layers` and
        # the other lowercase names below are closed-over free variables from
        # an enclosing scope not shown here — confirm against the caller.
        if mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state arrives as an (h, c) tuple; split it.
            hx, cx = hx
        else:
            cx = None

        # NOTE(review): `handle` is unused below — presumably retained for the
        # side effect of creating the cuDNN handle; verify.
        handle = cudnn.get_handle()
        dropout_ts = cudnn.rnn.init_dropout_state(torch.cuda.uint8, dropout, train, dropout_seed, dropout_state)

        # Flatten per-layer weight lists; weight_stride0 is the number of
        # weight tensors per layer.
        weight_arr = list(itertools.chain.from_iterable(weight))
        weight_stride0 = len(weight[0])

        output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
            input, weight_arr, weight_stride0,
            Variable(flat_weight) if flat_weight is not None else None,
            hx, cx,
            mode, hidden_size, num_layers,
            batch_first, dropout, train, bool(bidirectional),
            list(batch_sizes.data) if variable_length else (),
            dropout_ts)

        # LSTM returns (output, (h_n, c_n)); other modes return (output, h_n).
        if cx is not None:
            return (output, (hy, cy))
        else:
            return (output, hy)
Example #3 (score: 0)
File: rnn.py — Project: pelluru/pytorch-1
    def forward(input, weight, hx, batch_sizes):
        # cuDNN RNN forward pass (older variant: dropout descriptor API and
        # the _cudnn_rnn entry point under torch._C._VariableFunctions).
        # NOTE(review): `mode`, `flat_weight`, `hidden_size` and the other
        # lowercase names below are free variables captured from an enclosing
        # scope not visible here — confirm against the surrounding code.
        if mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state arrives as an (h, c) tuple; split it.
            hx, cx = hx
        else:
            cx = None

        handle = cudnn.get_handle()
        dropout_desc = cudnn.rnn.init_dropout_descriptor(handle, dropout, train, dropout_seed, dropout_state)

        # Flatten per-layer weight lists; weight_stride0 is the number of
        # weight tensors per layer.
        weight_arr = list(itertools.chain.from_iterable(weight))
        weight_stride0 = len(weight[0])

        output, hy, cy, reserve, new_weight_buf = torch._C._VariableFunctions._cudnn_rnn(
            input, weight_arr, weight_stride0,
            Variable(flat_weight) if flat_weight is not None else None,
            hx, cx,
            mode, hidden_size, num_layers,
            batch_first, dropout, train, bool(bidirectional),
            list(batch_sizes.data) if variable_length else (),
            Variable(dropout_desc.state) if dropout_desc.state is not None else None)

        # LSTM returns (output, (h_n, c_n)); other modes return (output, h_n).
        if cx is not None:
            return (output, (hy, cy))
        else:
            return (output, hy)
Example #4 (score: 0)
    def flatten_parameters(self):
        """Resets parameter data pointer so that they can use faster code paths.

        Right now, this works only if the module is on the GPU and cuDNN is enabled.
        Otherwise, it's a no-op.
        """
        # Use one parameter as a representative for device/dtype checks.
        any_param = next(self.parameters()).data
        if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(
                any_param):
            # Not eligible for the fast path; record empty pointer list so
            # later checks see the parameters as "not flattened".
            self._data_ptrs = []
            return

        with torch.cuda.device_of(any_param):
            # This is quite ugly, but it allows us to reuse the cuDNN code without larger
            # modifications. It's really a low-level API that doesn't belong in here, but
            # let's make this exception.
            from torch.backends.cudnn import rnn
            from torch.backends import cudnn
            from torch.nn._functions.rnn import CudnnRNN
            handle = cudnn.get_handle()
            # Suppress warnings emitted while constructing the throwaway
            # CudnnRNN function object used only to size the weight buffer.
            with warnings.catch_warnings(record=True):
                fn = CudnnRNN(
                    self.mode,
                    self.input_size,
                    self.hidden_size,
                    num_layers=self.num_layers,
                    batch_first=self.batch_first,
                    dropout=self.dropout,
                    train=self.training,
                    bidirectional=self.bidirectional,
                    dropout_state=self.dropout_state,
                )

            # Initialize descriptors
            fn.datatype = cudnn._typemap[any_param.type()]
            fn.x_descs = cudnn.descriptor(any_param.new(1, self.input_size), 1)
            fn.rnn_desc = rnn.init_rnn_descriptor(fn, handle)

            # Allocate buffer to hold the weights
            self._param_buf_size = rnn.get_num_weights(handle, fn.rnn_desc,
                                                       fn.x_descs[0],
                                                       fn.datatype)
            fn.weight_buf = any_param.new(self._param_buf_size).zero_()
            fn.w_desc = rnn.init_weight_descriptor(fn, fn.weight_buf)

            # Slice off views into weight_buf
            params = rnn.get_parameters(fn, handle, fn.weight_buf)
            all_weights = [[p.data for p in l] for l in self.all_weights]

            # Copy weights and update their storage
            rnn._copyParams(all_weights, params)
            for orig_layer_param, new_layer_param in zip(all_weights, params):
                for orig_param, new_param in zip(orig_layer_param,
                                                 new_layer_param):
                    # Re-point each parameter at its slice of the flat buffer,
                    # keeping its original shape.
                    orig_param.set_(new_param.view_as(orig_param))

            # Remember the flattened data pointers so later calls can detect
            # whether the parameters still alias the flat buffer.
            self._data_ptrs = list(p.data.data_ptr()
                                   for p in self.parameters())
Example #5 (score: 0)
File: rnn.py — Project: Northrend/pytorch
    def flatten_parameters(self):
        """Resets parameter data pointer so that they can use faster code paths.

        Right now, this works only if the module is on the GPU and cuDNN is enabled.
        Otherwise, it's a no-op.
        """
        # Use one parameter as a representative for device/dtype checks.
        any_param = next(self.parameters()).data
        if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(any_param):
            # Fast path unavailable; mark parameters as not flattened.
            self._data_ptrs = []
            return

        with torch.cuda.device_of(any_param):
            # This is quite ugly, but it allows us to reuse the cuDNN code without larger
            # modifications. It's really a low-level API that doesn't belong in here, but
            # let's make this exception.
            from torch.backends.cudnn import rnn
            from torch.backends import cudnn
            from torch.nn._functions.rnn import CudnnRNN
            handle = cudnn.get_handle()
            # Suppress warnings raised while building the throwaway CudnnRNN
            # object used only to compute the flat weight-buffer size.
            with warnings.catch_warnings(record=True):
                fn = CudnnRNN(
                    self.mode,
                    self.input_size,
                    self.hidden_size,
                    num_layers=self.num_layers,
                    batch_first=self.batch_first,
                    dropout=self.dropout,
                    train=self.training,
                    bidirectional=self.bidirectional,
                    dropout_state=self.dropout_state,
                )

            # Initialize descriptors
            fn.datatype = cudnn._typemap[any_param.type()]
            fn.x_descs = cudnn.descriptor(any_param.new(1, self.input_size), 1)
            fn.rnn_desc = rnn.init_rnn_descriptor(fn, handle)

            # Allocate buffer to hold the weights
            self._param_buf_size = rnn.get_num_weights(handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
            fn.weight_buf = any_param.new(self._param_buf_size).zero_()
            fn.w_desc = rnn.init_weight_descriptor(fn, fn.weight_buf)

            # Slice off views into weight_buf
            params = rnn.get_parameters(fn, handle, fn.weight_buf)
            all_weights = [[p.data for p in l] for l in self.all_weights]

            # Copy weights and update their storage
            rnn._copyParams(all_weights, params)
            for orig_layer_param, new_layer_param in zip(all_weights, params):
                for orig_param, new_param in zip(orig_layer_param, new_layer_param):
                    # Re-point each parameter at its slice of the flat buffer,
                    # preserving its original shape.
                    orig_param.set_(new_param.view_as(orig_param))

            # Remember the flattened pointers so later calls can detect whether
            # the parameters still alias the flat buffer.
            self._data_ptrs = list(p.data.data_ptr() for p in self.parameters())
Example #6 (score: 0)
def backward_weight(fn, input, hx, output, weight, grad_weight):
    """Compute weight gradients for a cuDNN RNN via cudnnRNNBackwardWeights.

    Accumulates gradients into a fresh flat buffer `dw`, then copies the
    per-parameter slices into `grad_weight` (returned, mutated in place).
    `fn` is the project-level RNN state object carrying descriptors, the
    flat weight buffer, workspace size and the forward-pass reserve buffer.
    """
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state is an (h, c) tuple; only h feeds this call.
            hx, cx = hx
        else:
            cx = None

        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; undo batch_first.
            input = input.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        if not fn.requires_grad:
            raise RuntimeError(
                'backward_weight can only be called when the function requires grad!'
            )
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                hidden_size, hx.size()))

        # cuDNN requires contiguous tensors for the raw-pointer calls below.
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        y = output
        # Fresh zeroed flat buffer; cudnnRNNBackwardWeights accumulates into it.
        dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        check_error(
            cudnn.lib.cudnnRNNBackwardWeights(
                handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(hx.data_ptr()), fn.y_descs,
                ctypes.c_void_p(y.data_ptr()),
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))

        # copy the weights from the weight_buf into grad_weight
        grad_params = get_parameters(fn, handle, dw)
        _copyParams(grad_params, grad_weight)
        return grad_weight
Example #7 (score: 0)
def backward_weight(fn, input, hx, output, weight, grad_weight):
    """Compute weight gradients for a cuDNN RNN via cudnnRNNBackwardWeights.

    Gradients are accumulated into a fresh zeroed flat buffer `dw` and then
    copied slice-by-slice into `grad_weight`, which is returned. `fn` is the
    project-level RNN state object (descriptors, flat weight buffer,
    workspace size, reserve buffer from the forward pass).
    """
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state is an (h, c) tuple; only h feeds this call.
            hx, cx = hx
        else:
            cx = None

        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; undo batch_first.
            input = input.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        if not fn.requires_grad:
            raise RuntimeError('backward_weight can only be called when the function requires grad!')
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                hidden_size, hx.size()))

        # cuDNN requires contiguous tensors for the raw-pointer calls below.
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        y = output
        # Fresh zeroed flat buffer; cudnnRNNBackwardWeights accumulates into it.
        dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        check_error(cudnn.lib.cudnnRNNBackwardWeights(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.x_descs, ctypes.c_void_p(x.data_ptr()),
            fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
            fn.y_descs, ctypes.c_void_p(y.data_ptr()),
            ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
            fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
            ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
        ))

        # copy the weights from the weight_buf into grad_weight
        grad_params = get_parameters(fn, handle, dw)
        _copyParams(grad_params, grad_weight)
        return grad_weight
Example #8 (score: 0)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy,
                  grad_input, grad_hx):
    """Compute input/hidden gradients for a cuDNN RNN via cudnnRNNBackwardData.

    Results are written in place into `grad_input` and `grad_hx` (and the
    cell-gradient counterparts for LSTM). `fn` is the project-level RNN state
    object carrying descriptors, the flat weight buffer, workspace and the
    forward-pass reserve buffer.
    """
    with torch.cuda.device_of(input):
        handle = cudnn.get_handle()

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries cell state: split every (h, c) pair.
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None

        if fn.batch_first:
            # cuDNN expects seq-major layout; undo batch_first.
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)

        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.resize_(*hidden_size)
        # NOTE(review): `if grad_cy:` / `if hx:` below test tensor truthiness
        # rather than `is not None`; presumably valid for the torch version
        # this was written against — verify before reusing.
        dcy = grad_cy.resize_(*hidden_size) if grad_cy else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx else None

        # NOTE(review): `lib` is not bound in this function — it presumably
        # exists at module level in the original file; confirm.
        if fn.dropout != 0 and lib.version < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if not fn.train:
            raise RuntimeError(
                'backward_grad can only be called when training!')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != _output_size(fn):
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))

        # Cell-state pointers are passed only for LSTM (cx is not None).
        check_error(
            cudnn.lib.cudnnRNNBackwardData(
                handle, fn.rnn_desc, fn.seq_length, fn.y_descs,
                ctypes.c_void_p(y.data_ptr()), fn.y_descs,
                ctypes.c_void_p(dy.data_ptr()), fn.hy_desc,
                ctypes.c_void_p(dhy.data_ptr()), fn.cy_desc,
                ctypes.c_void_p(dcy.data_ptr()) if cx else None, fn.w_desc,
                ctypes.c_void_p(w.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx else None, fn.x_descs,
                ctypes.c_void_p(dx.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(dhx.data_ptr()), fn.cx_desc,
                ctypes.c_void_p(dcx.data_ptr()) if cx else None,
                ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))

        if fn.batch_first:
            # Restore the caller's batch-first layout on the input gradient.
            grad_input = grad_input.transpose(0, 1)
Example #9 (score: 0)
def forward(fn, input, hx, weight, output, hy):
    """Run the cuDNN RNN forward pass, writing results into `output` and `hy`.

    Sets up descriptors and the flat weight buffer on `fn` (the project-level
    RNN state object), then dispatches to cudnnRNNForwardTraining or
    cudnnRNNForwardInference depending on `fn.train`.

    Args:
        fn: RNN state object carrying mode, sizes, dropout config and, after
            this call, the descriptors, weight buffer, workspace and reserve.
        input: 3-D input tensor (seq x batch x feature; batch-first inputs
            are transposed below).
        hx: initial hidden state; an (h, c) tuple for LSTM.
        weight: per-layer parameter tensors to copy into the flat buffer.
        output, hy: pre-allocated tensors resized and filled in place
            (hy is an (h, c) tuple for LSTM).

    Raises:
        RuntimeError: on bad input rank/size or unsupported dropout config.
    """
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries cell state: split the (h, c) pairs.
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None

        if fn.batch_first:
            # cuDNN expects seq-major layout.
            input = input.transpose(0, 1)

        if input.dim() != 3:
            raise RuntimeError('input must have 3 dimensions, got {}'.format(
                input.dim()))
        if fn.input_size != input.size(2):
            # BUG FIX: the format string has two placeholders but was given
            # only one argument (fn.input_size), so this path raised an
            # IndexError from .format() instead of the intended message.
            raise RuntimeError(
                'input.size(2) must be equal to input_size. Expected {}, got {}'
                .format(fn.input_size, input.size(2)))
        if fn.dropout != 0 and cudnn.lib.version < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v5.1 and above')

        fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size).zero_()
        # NOTE(review): truthiness test on a tensor; presumably valid for the
        # torch version this targets — verify before reusing.
        if cy:
            cy.resize_(*hidden_size).zero_()
        y = output

        # init descriptors
        fn.dropout_desc = init_dropout_descriptor(fn, handle)
        fn.rnn_desc = init_rnn_descriptor(fn)
        fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
        fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx else None
        fn.cy_desc = cudnn.descriptor(cx) if cx else None

        # create the weight buffer and copy the weights into it
        num_weights = get_num_weights(handle, fn.rnn_desc, fn.x_descs[0],
                                      fn.datatype)
        fn.weight_buf = input.new(num_weights)
        fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
        w = fn.weight_buf
        # this zero might not seem necessary, but it is in the case
        # where biases are disabled; then they won't be copied and must be zero'd.
        # Alternatively, _copyParams could be written more carefully.
        w.zero_()
        params = get_parameters(fn, handle, w)
        _copyParams(weight, params)

        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, tuple(hx.size())))
        if cx and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        workspace_size = ctypes.c_long()
        check_error(
            lib.cudnnGetRNNWorkspaceSize(handle, fn.rnn_desc, fn.seq_length,
                                         fn.x_descs,
                                         ctypes.byref(workspace_size)))
        fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
        if fn.train:
            # Training needs a reserve buffer that the backward passes reuse.
            reserve_size = ctypes.c_long()
            check_error(
                lib.cudnnGetRNNTrainingReserveSize(handle, fn.rnn_desc,
                                                   fn.seq_length, fn.x_descs,
                                                   ctypes.byref(reserve_size)))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)

            check_error(
                lib.cudnnRNNForwardTraining(
                    handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                    ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                    ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                    ctypes.c_void_p(cx.data_ptr()) if cx else None, fn.w_desc,
                    ctypes.c_void_p(w.data_ptr()), fn.y_descs,
                    ctypes.c_void_p(y.data_ptr()), fn.hy_desc,
                    ctypes.c_void_p(hy.data_ptr()), fn.cy_desc,
                    ctypes.c_void_p(cy.data_ptr()) if cx else None,
                    ctypes.c_void_p(fn.workspace.data_ptr()),
                    fn.workspace.size(0),
                    ctypes.c_void_p(fn.reserve.data_ptr()),
                    fn.reserve.size(0)))
        else:  # inference
            # NOTE(review): this branch passes fn.reserve, which is only
            # allocated in the training branch above — in inference mode
            # fn.reserve would be unset unless a prior training call set it.
            # Other versions of this binding pass the workspace here instead;
            # confirm against the original file before relying on this path.
            check_error(
                lib.cudnnRNNForwardInference(
                    handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                    ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                    ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                    ctypes.c_void_p(cx.data_ptr()) if cx else None, fn.w_desc,
                    ctypes.c_void_p(w.data_ptr()), fn.y_descs,
                    ctypes.c_void_p(y.data_ptr()), fn.hy_desc,
                    ctypes.c_void_p(hy.data_ptr()), fn.cy_desc,
                    ctypes.c_void_p(cy.data_ptr()) if cx else None,
                    ctypes.c_void_p(fn.reserve.data_ptr()),
                    fn.reserve.size(0)))

        if fn.batch_first:
            # Restore the caller's batch-first layout.
            output = output.transpose(0, 1)
Example #10 (score: 0)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx):
    """Compute input/hidden gradients for a cuDNN RNN via cudnnRNNBackwardData.

    Results are written in place into `grad_input` and `grad_hx` (and the
    cell-state counterparts for LSTM). `fn` is the project-level RNN state
    object carrying descriptors, the flat weight buffer, workspace size and
    the forward-pass reserve buffer.
    """
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries cell state: split every (h, c) pair.
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None

        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; undo batch_first.
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)

        # cuDNN requires contiguous tensors for the raw-pointer calls below.
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.contiguous().view(*hidden_size)
        dcy = grad_cy.contiguous().view(*hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None

        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
        if not fn.requires_grad:
            raise RuntimeError('backward_grad can only be called when the function requires grad!')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != output_size:
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
            raise RuntimeError('Gradients aren\'t CUDA tensors')

        # Workspace is re-allocated per call on the input's device.
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        # Cell-state pointers are passed only for LSTM (cx is not None).
        check_error(cudnn.lib.cudnnRNNBackwardData(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.y_descs, ctypes.c_void_p(y.data_ptr()),
            fn.y_descs, ctypes.c_void_p(dy.data_ptr()),
            fn.hy_desc, ctypes.c_void_p(dhy.data_ptr()),
            fn.cy_desc, ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
            fn.w_desc, ctypes.c_void_p(w.data_ptr()),
            fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
            fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
            fn.x_descs, ctypes.c_void_p(dx.data_ptr()),
            fn.hx_desc, ctypes.c_void_p(dhx.data_ptr()),
            fn.cx_desc, ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
            ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
            ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
        ))

        if fn.batch_first and not is_input_packed:
            # Restore the caller's batch-first layout on the input gradient.
            grad_input = grad_input.transpose_(0, 1)
Example #11 (score: 0)
def forward(fn, input, hx, weight, output, hy):
    """Run the cuDNN RNN forward pass, writing results into `output` and `hy`.

    Handles both padded (seq x batch x feature) and packed variable-length
    inputs, sets up descriptors and the flat weight buffer on `fn` (the
    project-level RNN state object), then dispatches to
    cudnnRNNForwardTraining or cudnnRNNForwardInference depending on
    fn.requires_grad. `hx`/`hy` are (h, c) tuples for LSTM.
    """
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]
        is_input_packed = fn.batch_sizes is not None

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries cell state: split the (h, c) pairs.
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None

        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout.
            input = input.transpose(0, 1)

        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v5.1 and above')

        # Derive sequence geometry either from the packing metadata or from
        # the padded input's shape.
        if is_input_packed:
            fn.seq_length = len(fn.batch_sizes)
            fn.mini_batch = fn.batch_sizes[0]
            fn.input_size = input.size(-1)
        else:
            fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)

        # cuDNN requires contiguous tensors for the raw-pointer calls below.
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size)
        if cy is not None:
            cy.resize_(*hidden_size)
        y = output

        # init descriptors
        fn.rnn_desc = init_rnn_descriptor(fn, handle)
        if is_input_packed:
            fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
            fn.y_descs = cudnn.descriptor_sequence(y, fn.batch_sizes)
        else:
            fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
            fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        # (skipped when a flat buffer was pre-allocated, e.g. by
        # flatten_parameters — then the params already alias it).
        if fn.weight_buf is None:
            num_weights = get_num_weights(
                handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
            fn.weight_buf = x.new(num_weights)
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf
            # this zero might not seem necessary, but it is in the case
            # where biases are disabled; then they won't be copied and must be zero'd.
            # Alternatively, _copyParams could be written more carefully.
            w.zero_()
            params = get_parameters(fn, handle, w)
            _copyParams(weight, params)
        else:
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf

        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        # Query and allocate the per-call workspace.
        workspace_size = ctypes.c_long()
        check_error(lib.cudnnGetRNNWorkspaceSize(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.x_descs,
            ctypes.byref(workspace_size)
        ))
        fn.workspace_size = workspace_size.value
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        if fn.requires_grad:
            # Training also needs a reserve buffer kept for the backward pass.
            reserve_size = ctypes.c_long()
            check_error(lib.cudnnGetRNNTrainingReserveSize(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs,
                ctypes.byref(reserve_size)
            ))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)

            check_error(lib.cudnnRNNForwardTraining(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        else:  # inference
            check_error(lib.cudnnRNNForwardInference(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0)
            ))

        if fn.batch_first and not is_input_packed:
            # Restore the caller's batch-first layout (in place).
            output.transpose_(0, 1)
Example #12 (score: 0)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy,
                  grad_input, grad_hx):
    """Compute input/hidden gradients for a cuDNN RNN via cudnnRNNBackwardData.

    Results are written in place into `grad_input` and `grad_hx` (and the
    cell-state counterparts for LSTM). `fn` is the project-level RNN state
    object carrying descriptors, the flat weight buffer, workspace and the
    forward-pass reserve buffer.
    """
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()

        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries cell state: split every (h, c) pair.
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None

        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; undo batch_first.
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)

        # cuDNN requires contiguous tensors for the raw-pointer calls below.
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.contiguous().view(*hidden_size)
        dcy = grad_cy.contiguous().view(
            *hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None

        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if not fn.requires_grad:
            raise RuntimeError(
                'backward_grad can only be called when the function requires grad!'
            )
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != output_size:
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        if not dhy.is_cuda or not dy.is_cuda or (dcy is not None
                                                 and not dcy.is_cuda):
            raise RuntimeError('Gradients aren\'t CUDA tensors')

        # Cell-state pointers are passed only for LSTM (cx is not None).
        # NOTE(review): this variant uses fn.workspace directly, unlike the
        # sibling version that allocates a fresh workspace per call.
        check_error(
            cudnn.lib.cudnnRNNBackwardData(
                handle, fn.rnn_desc, fn.seq_length, fn.y_descs,
                ctypes.c_void_p(y.data_ptr()), fn.y_descs,
                ctypes.c_void_p(dy.data_ptr()), fn.hy_desc,
                ctypes.c_void_p(dhy.data_ptr()), fn.cy_desc,
                ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.x_descs, ctypes.c_void_p(dx.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(dhx.data_ptr()), fn.cx_desc,
                ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))

        if fn.batch_first and not is_input_packed:
            # Restore the caller's batch-first layout on the input gradient.
            grad_input = grad_input.transpose_(0, 1)