def from_bytes(self, bytes_data: bytes) -> "DocBin":
    """Deserialize the DocBin's annotations from a bytestring.

    bytes_data (bytes): The data to load from.
    RETURNS (DocBin): The loaded DocBin.

    DOCS: https://spacy.io/api/docbin#from_bytes
    """
    try:
        msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
    except zlib.error:
        raise ValueError(Errors.E1014)
    self.attrs = msg["attrs"]
    self.strings = set(msg["strings"])
    lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
    flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
    flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
    shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
    flat_tokens = flat_tokens.reshape(shape)
    flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
    self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
    self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
    self.cats = msg["cats"]
    self.span_groups = msg.get("span_groups", [b"" for _ in lengths])
    self.flags = msg.get("flags", [{} for _ in lengths])
    if "user_data" in msg:
        self.user_data = list(msg["user_data"])
    else:
        self.user_data = [None] * len(self)
    for tokens in self.tokens:
        assert len(tokens.shape) == 2, tokens.shape  # this should never happen
    return self
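# For orientation, a minimal round-trip through the public DocBin API that
# exercises from_bytes(); the attrs list and texts below are arbitrary example
# values, not anything prescribed by spaCy.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Serialize these tokens"), nlp("And these too")]
# to_bytes() produces the zlib-compressed msgpack payload parsed above.
data = DocBin(attrs=["ORTH", "SPACY"], docs=docs).to_bytes()
restored = DocBin().from_bytes(data)
assert len(list(restored.get_docs(nlp.vocab))) == 2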
def get_expected_predict(input_data, Ws, bs):
    numpy_ops = NumpyOps()
    X = input_data
    for i, (W, b) in enumerate(zip(Ws, bs)):
        X = numpy_ops.asarray(X)
        if i > 0:
            X *= X > 0
        X = numpy.tensordot(X, W, axes=[[1], [1]]) + b
    return X
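# A quick sketch of calling the reference helper above directly; the shapes and
# random weights are made up for illustration, with each W following the
# (nO, nI) convention implied by the tensordot axes.
import numpy

rng = numpy.random.RandomState(0)
X = rng.uniform(-1, 1, (4, 3)).astype("float32")  # batch of 4, input width 3
Ws = [
    rng.uniform(-1, 1, (5, 3)).astype("float32"),  # layer 1: 3 -> 5
    rng.uniform(-1, 1, (2, 5)).astype("float32"),  # layer 2: 5 -> 2, ReLU applied before it
]
bs = [numpy.zeros((5,), dtype="float32"), numpy.zeros((2,), dtype="float32")]
Y = get_expected_predict(X, Ws, bs)
assert Y.shape == (4, 2)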
def test_apply_alignment(nested_align, X_cols):
    ops = NumpyOps()
    align = get_ragged(ops, nested_align)
    X_shape = (align.data.max() + 1, X_cols)
    X = ops.alloc2f(*X_shape)
    Y, get_dX = apply_alignment(ops, align, X)
    assert isinstance(Y, Ragged)
    assert Y.data.shape[0] == align.data.shape[0]
    assert Y.lengths.shape[0] == len(nested_align)
    dX = get_dX(Y)
    assert dX.shape == X.shape
def test_truncate_alignment_from_end(sequences, max_length, align, mask_from_end):
    # print("Max length", max_length)
    # print("Sequences", sequences)
    # print("Mask", mask_from_end)
    ops = NumpyOps()
    truncated = _truncate_alignment(align, mask_from_end)
    # print(truncated.dataXd.shape, truncated.lengths.sum())
    # print("Before", list(map(list, ops.unflatten(align.dataXd, align.lengths))))
    # print("After", list(map(list, ops.unflatten(truncated.dataXd, truncated.lengths))))
    # Check that the number of tokens hasn't changed. We still need to have
    # alignment for every token.
    assert truncated.lengths.shape[0] == align.lengths.shape[0]
    start = 0
    for i, seq in enumerate(sequences):
        end = start + len(seq)
        # Get the alignment for this sequence of tokens. Each length in the
        # alignment indicates the number of wordpiece tokens, so we need to
        # check that the sum of the lengths doesn't exceed the maximum.
        wp_indices = truncated[start:end]
        assert wp_indices.lengths.sum() <= max_length
        # We're truncating from the end, so we shouldn't see different values
        # except at the end of the sequence.
        seen_zero = False
        before = align[start:end]
        for length_now, length_before in zip(wp_indices.lengths, before.lengths):
            if seen_zero:
                assert length_now == 0, wp_indices.lengths
            elif length_now == 0:
                seen_zero = True
            else:
                assert length_now == length_before
        start = end
def test_list2padded():
    ops = NumpyOps()
    seqs = [numpy.zeros((5, 4)), numpy.zeros((8, 4)), numpy.zeros((2, 4))]
    padded = ops.list2padded(seqs)
    arr = padded.data
    size_at_t = padded.size_at_t
    assert arr.shape == (8, 3, 4)
    assert size_at_t[0] == 3
    assert size_at_t[1] == 3
    assert size_at_t[2] == 2
    assert size_at_t[3] == 2
    assert size_at_t[4] == 2
    assert size_at_t[5] == 1
    assert size_at_t[6] == 1
    assert size_at_t[7] == 1
    unpadded = ops.padded2list(padded)
    assert unpadded[0].shape == (5, 4)
    assert unpadded[1].shape == (8, 4)
    assert unpadded[2].shape == (2, 4)
def test_large_seq2col_gpu_against_cpu(nW):
    cupy_ops = CupyOps()
    numpy_ops = NumpyOps()
    # Use array with a large enough batch to require multiple CUDA grids.
    batch_size = 128 * 128 * 2  # threads per block * blocks * 2
    X = numpy_ops.xp.random.randn(batch_size * 2).astype("float32").reshape(-1, 2)
    X_gpu = cupy_ops.asarray2f(X)
    # Use somewhat interesting sequence lengths.
    lengths = numpy_ops.asarray1i([1, 4, 2, 1] * (batch_size // 8))
    lengths_gpu = cupy_ops.asarray1i(lengths)
    cols = numpy_ops.seq2col(X, nW=nW, lengths=lengths)
    cols_gpu = cupy_ops.seq2col(X_gpu, nW=nW, lengths=lengths_gpu)
    assert_allclose(cols, cols_gpu.get())
def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
    assert isinstance(token_data, (BatchEncoding, dict))
    pad_token = token_data.get("pad_token", "[PAD]")
    lengths = [
        len([tok for tok in tokens if tok != pad_token])
        for tokens in token_data["input_texts"]
    ]
    numpy_ops = NumpyOps()
    return cls(
        strings=token_data["input_texts"],
        input_ids=numpy_ops.asarray2i(token_data["input_ids"]),
        attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]),
        lengths=lengths,
        token_type_ids=(
            numpy_ops.asarray2i(token_data["token_type_ids"])
            if "token_type_ids" in token_data
            else None
        ),
    )
def set_backend(name, gpu_id):
    global CONFIG
    if name == "jax":
        set_current_ops(JaxOps())
        CONFIG = CONFIG.replace("PyTorch", "")
    else:
        if gpu_id == -1:
            set_current_ops(NumpyOps())
        else:
            set_current_ops(CupyOps())
        CONFIG = CONFIG.replace("LSTM.v1", "PyTorchLSTM.v1")
def set_backend(name, gpu_id):
    global CONFIG
    if name == "generic":
        set_current_ops(Ops())
    else:
        if gpu_id == -1:
            set_current_ops(NumpyOps(use_blis=True))
        else:
            set_current_ops(CupyOps())
    if name == "pytorch":
        import torch

        torch.set_num_threads(1)
        CONFIG = CONFIG.replace("LSTM.v1", "PyTorchLSTM.v1")
def test_partials_from_config():
    """Test that functions registered with partial applications are handled
    correctly (e.g. initializers)."""
    name = "uniform_init.v1"
    cfg = {"test": {"@initializers": name, "lo": -0.2}}
    func = my_registry.make_from_config(cfg)["test"]
    assert hasattr(func, "__call__")
    # The partial will still have lo as an arg, just with default
    assert len(inspect.signature(func).parameters) == 4
    # Make sure returned partial function has correct value set
    assert inspect.signature(func).parameters["lo"].default == -0.2
    # Actually call the function and verify
    func(NumpyOps(), (2, 3))
    # Make sure validation still works
    bad_cfg = {"test": {"@initializers": name, "lo": [0.5]}}
    with pytest.raises(ConfigValidationError):
        my_registry.make_from_config(bad_cfg)
    bad_cfg = {"test": {"@initializers": name, "lo": -0.2, "other": 10}}
    with pytest.raises(ConfigValidationError):
        my_registry.make_from_config(bad_cfg)
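# The dict config above is equivalent in spirit to binding keyword arguments
# onto the registered initializer. The sketch below assumes thinc's public
# uniform_init (the function behind "uniform_init.v1") and only approximates
# what the registry builds internally.
from functools import partial

from thinc.api import NumpyOps, uniform_init

init = partial(uniform_init, lo=-0.2)  # lo is bound; ops and shape come from the caller
weights = init(NumpyOps(), (2, 3))
assert weights.shape == (2, 3)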
def sgd():
    return SGD(0.001, ops=NumpyOps())
def ops():
    return NumpyOps()
from typing import List, Optional

from numpy.testing import assert_almost_equal
from thinc.api import registry, with_padded, Dropout, NumpyOps, Model
from thinc.util import data_validation, get_width
from thinc.types import Ragged, Padded, Array2d, Floats2d, FloatsXd, Shape
from thinc.util import has_torch
import numpy
import pytest

OPS = NumpyOps()


class NoDropoutOps(NumpyOps):
    def get_dropout_mask(self, shape: Shape, drop: Optional[float]) -> FloatsXd:
        if drop is None or drop <= 0:
            return self.xp.ones(shape, dtype="f")
        else:
            raise ValueError("During prediction, dropout should not be applied")


array1d = OPS.xp.asarray([1, 2, 3], dtype="f")
array1dint = OPS.xp.asarray([1, 2, 3], dtype="i")
array2d = OPS.xp.asarray([[4, 2, 3, 4], [1, 5, 3, 1], [9, 8, 5, 7]], dtype="f")
array2dint = OPS.xp.asarray([[1, 2, 3], [4, 5, 6]], dtype="i")
array3d = OPS.xp.zeros((3, 3, 3), dtype="f")
ragged = Ragged(array2d, OPS.xp.asarray([2, 1], dtype="i"))
padded = Padded(array3d, array1d, OPS.asarray1i([1, 2, 3, 4]),
def get_model(W_values, b_values):
    model = Linear(W_values.shape[0], W_values.shape[1], ops=NumpyOps())
    model.initialize()
    model.set_param("W", W_values)
    model.set_param("b", b_values)
    return model
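# get_model() and get_expected_predict() (defined earlier) can be exercised
# together: the Linear layer's prediction should match the numpy reference.
# The weights here are arbitrary example values.
import numpy
from numpy.testing import assert_allclose

W = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype="f")  # nO=2, nI=2
b = numpy.asarray([0.5, -0.5], dtype="f")
X = numpy.asarray([[1.0, 2.0]], dtype="f")
model = get_model(W, b)
assert_allclose(model.predict(X), get_expected_predict(X, [W], [b]), rtol=1e-5)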
@pytest.mark.parametrize("ops", [Ops(), NumpyOps()])
@pytest.mark.parametrize("nO,nI", [(1, 2), (2, 2), (100, 200), (9, 6)])
def test_LSTM_init_with_sizes(ops, nO, nI):
    model = with_padded(LSTM(nO, nI, depth=1)).initialize()
    for node in model.walk():
        model.ops = ops
        # Check no unallocated params.
        assert node.has_param("LSTM") is not None
        assert node.has_param("HC0") is not None
    for node in model.walk():
        # Check param sizes.
        if node.has_param("LSTM"):
            params = node.get_param("LSTM")
            assert params.shape == (
                (nO * 4 * nI) + (nO * 4) + (nO * 4 * nO + nO * 4),
            )
def get_input(nr_batch, nr_in):
    ops = NumpyOps()
    return ops.alloc2f(nr_batch, nr_in)
def set_backend(name, gpu_id):
    if gpu_id == -1:
        set_current_ops(NumpyOps())
    else:
        set_current_ops(CupyOps())
        CONFIG = CONFIG.replace("LSTM.v1", "PyTorchLSTM.v1")
def test_initializer_func_setup(init_func):
    ops = NumpyOps()
    data = numpy.ndarray([1, 2, 3, 4], dtype="f")
    result = init_func(ops, data.shape)
    assert not numpy.array_equal(data, result)
def test_initializer_from_config(name, kwargs):
    """Test that initializers are loaded and configured correctly from registry
    (as partials)."""
    cfg = {"test": {"@initializers": name, **kwargs}}
    func = registry.make_from_config(cfg)["test"]
    func(NumpyOps(), (1, 2, 3, 4))
import inspect

import numpy
import pytest
from hypothesis.strategies import composite, integers
from numpy.testing import assert_allclose
from thinc.api import NumpyOps, CupyOps, Ops, get_ops
from thinc.api import get_current_ops, use_ops
from thinc.api import fix_random_seed
from thinc.api import LSTM

from .. import strategies
from ..strategies import ndarrays_of_shape

MAX_EXAMPLES = 10

VANILLA_OPS = Ops(numpy)
NUMPY_OPS = NumpyOps()
BLIS_OPS = NumpyOps(use_blis=True)
CPU_OPS = [NUMPY_OPS, VANILLA_OPS]
XP_OPS = [NUMPY_OPS]
if CupyOps.xp is not None:
    XP_OPS.append(CupyOps())
ALL_OPS = XP_OPS + [VANILLA_OPS]


@pytest.mark.parametrize("op", [NumpyOps, CupyOps])
def test_ops_consistency(op):
    """Test that specific ops don't define any methods that are not on the Ops
    base class and that all ops methods define the exact same arguments."""
    attrs = [m for m in dir(op) if not m.startswith("_")]
    for attr in attrs:
        assert hasattr(Ops, attr)
def get_ops():
    return NumpyOps()
    with Model.define_operators({">>": chain}):
        model = model1 >> model2 >> model3
    assert len(model.layers) == 2
    assert len(model.layers[0].layers) == 2


def test_chain_right_branch(model1, model2, model3):
    # Previously we 'flattened' these nested calls. We might opt to do so
    # again, especially for the operators.
    merge1 = chain(model1, model2)
    merge2 = chain(merge1, model3)
    assert len(merge1.layers) == 2
    assert len(merge2.layers) == 2


@pytest.mark.parametrize("ops", [NumpyOps(), NumpyOps(use_blis=True)])
def test_chain(ops):
    data = numpy.asarray([[1, 2, 3, 4]], dtype="f")
    model = chain(Linear(1), Dropout(), Linear(1))
    model.ops = ops
    model.initialize(data, data)
    Y, backprop = model(data, is_train=True)
    backprop(Y)
    # Layers with and without nO/nI
    model = chain(Linear(1), Dropout(), Linear(1, 1))
    model.initialize(data, data)
    # Setting dim on model
    model = chain(Linear(1), Dropout(), Linear(1))
    model.set_dim("nO", 1)
    model.initialize(data, None)
    model = chain(Linear(1, 1), Dropout(), Linear(1, 1))
from typing import List

import numpy
import pytest
from numpy.testing import assert_almost_equal
from thinc.api import NumpyOps, Model, data_validation
from thinc.types import Array2d, Ragged

from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.ml import FeatureExtractor, StaticVectors
from spacy.ml._character_embed import CharacterEmbed
from spacy.tokens import Doc

OPS = NumpyOps()

texts = ["These are 4 words", "Here just three"]
l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
l1 = [[9, 8], [7, 6], [5, 4]]
list_floats = [OPS.xp.asarray(l0, dtype="f"), OPS.xp.asarray(l1, dtype="f")]
list_ints = [OPS.xp.asarray(l0, dtype="i"), OPS.xp.asarray(l1, dtype="i")]
array = OPS.xp.asarray(l1, dtype="f")
ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i"))


def get_docs():
    vocab = Vocab()
    for t in texts:
        for word in t.split():
            hash_id = vocab.strings.add(word)
            vector = numpy.random.uniform(-1, 1, (7,))