Example #1
File: _serialize.py Project: nitsel/spaCy
    def from_bytes(self, bytes_data: bytes) -> "DocBin":
        """Deserialize the DocBin's annotations from a bytestring.

        bytes_data (bytes): The data to load from.
        RETURNS (DocBin): The loaded DocBin.

        DOCS: https://spacy.io/api/docbin#from_bytes
        """
        try:
            msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
        except zlib.error:
            raise ValueError(Errors.E1014)
        self.attrs = msg["attrs"]
        self.strings = set(msg["strings"])
        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
        flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
        flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
        shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
        flat_tokens = flat_tokens.reshape(shape)
        flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
        self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
        self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
        self.cats = msg["cats"]
        self.span_groups = msg.get("span_groups", [b"" for _ in lengths])
        self.flags = msg.get("flags", [{} for _ in lengths])
        if "user_data" in msg:
            self.user_data = list(msg["user_data"])
        else:
            self.user_data = [None] * len(self)
        for tokens in self.tokens:
            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
        return self
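
For context, a minimal round-trip through this API might look like the following sketch (using spacy.blank to stay self-contained):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin()
doc_bin.add(nlp("This is a sentence."))
bytes_data = doc_bin.to_bytes()

# from_bytes returns the DocBin itself, so calls can be chained.
docs = list(DocBin().from_bytes(bytes_data).get_docs(nlp.vocab))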
Example #2
def get_expected_predict(input_data, Ws, bs):
    numpy_ops = NumpyOps()
    X = input_data
    for i, (W, b) in enumerate(zip(Ws, bs)):
        X = numpy_ops.asarray(X)
        if i > 0:
            X *= X > 0  # ReLU on the output of the previous layer
        X = numpy.tensordot(X, W, axes=[[1], [1]]) + b
    return X
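
A quick shape-level sanity check of this reference implementation (the weights below are arbitrary):

rng = numpy.random.RandomState(0)
Ws = [rng.randn(8, 4).astype("f"), rng.randn(2, 8).astype("f")]
bs = [numpy.zeros(8, dtype="f"), numpy.zeros(2, dtype="f")]
Y = get_expected_predict(rng.randn(5, 4).astype("f"), Ws, bs)
assert Y.shape == (5, 2)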
Example #3
def test_apply_alignment(nested_align, X_cols):
    ops = NumpyOps()
    align = get_ragged(ops, nested_align)
    X_shape = (align.data.max() + 1, X_cols)
    X = ops.alloc2f(*X_shape)
    Y, get_dX = apply_alignment(ops, align, X)
    assert isinstance(Y, Ragged)
    assert Y.data.shape[0] == align.data.shape[0]
    assert Y.lengths.shape[0] == len(nested_align)
    dX = get_dX(Y)
    assert dX.shape == X.shape
Example #4
def test_truncate_alignment_from_end(sequences, max_length, align, mask_from_end):
    # print("Max length", max_length)
    # print("Sequences", sequences)
    # print("Mask", mask_from_end)
    ops = NumpyOps()
    truncated = _truncate_alignment(align, mask_from_end)
    # print(truncated.dataXd.shape, truncated.lengths.sum())
    # print("Before", list(map(list, ops.unflatten(align.dataXd, align.lengths))))
    # print("After", list(map(list, ops.unflatten(truncated.dataXd, truncated.lengths))))
    # Check that the number of tokens hasn't changed. We still need to have
    # alignment for every token.
    assert truncated.lengths.shape[0] == align.lengths.shape[0]
    start = 0
    for i, seq in enumerate(sequences):
        end = start + len(seq)
        # Get the alignment for this sequence of tokens. Each length in the
        # alignment indicates the number of wordpiece tokens, so we need to
        # check that the sum of the lengths doesn't exceed the maximum.
        wp_indices = truncated[start:end]
        assert wp_indices.lengths.sum() <= max_length
        # We're truncating from the end, so we shouldn't see different values
        # except at the end of the sequence.
        seen_zero = False
        before = align[start:end]
        for length_now, length_before in zip(wp_indices.lengths, before.lengths):
            if seen_zero:
                assert length_now == 0, wp_indices.lengths
            elif length_now == 0:
                seen_zero = True
            else:
                assert length_now == length_before
Example #5
def test_list2padded():
    ops = NumpyOps()
    seqs = [numpy.zeros((5, 4)), numpy.zeros((8, 4)), numpy.zeros((2, 4))]
    padded = ops.list2padded(seqs)
    arr = padded.data
    size_at_t = padded.size_at_t
    assert arr.shape == (8, 3, 4)
    assert size_at_t[0] == 3
    assert size_at_t[1] == 3
    assert size_at_t[2] == 2
    assert size_at_t[3] == 2
    assert size_at_t[4] == 2
    assert size_at_t[5] == 1
    assert size_at_t[6] == 1
    assert size_at_t[7] == 1
    unpadded = ops.padded2list(padded)
    assert unpadded[0].shape == (5, 4)
    assert unpadded[1].shape == (8, 4)
    assert unpadded[2].shape == (2, 4)
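
The size_at_t expectations follow directly from the sequence lengths (5, 8, 2): entry t counts how many sequences are still active at timestep t.

lengths = [5, 8, 2]
size_at_t = [sum(1 for n in lengths if n > t) for t in range(max(lengths))]
assert size_at_t == [3, 3, 2, 2, 2, 1, 1, 1]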
Example #6
File: test_ops.py Project: admariner/thinc
def test_large_seq2col_gpu_against_cpu(nW):
    cupy_ops = CupyOps()
    numpy_ops = NumpyOps()

    # Use array with a large enough batch to require multiple
    # CUDA grids.
    batch_size = 128 * 128 * 2  # threads per block * blocks * 2
    X = numpy_ops.xp.random.randn(batch_size * 2).astype("float32").reshape(-1, 2)
    X_gpu = cupy_ops.asarray2f(X)

    # Use somewhat interesting sequence lengths.
    lengths = numpy_ops.asarray1i([1, 4, 2, 1] * (batch_size // 8))
    lengths_gpu = cupy_ops.asarray1i(lengths)

    cols = numpy_ops.seq2col(X, nW=nW, lengths=lengths)
    cols_gpu = cupy_ops.seq2col(X_gpu, nW=nW, lengths=lengths_gpu)

    assert_allclose(cols, cols_gpu.get())
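
For reference, seq2col concatenates each row with its nW neighbours on either side, zero-padding at sequence edges. A tiny CPU-only illustration:

ops = NumpyOps()
X = ops.asarray2f([[1.0], [2.0], [3.0]])
cols = ops.seq2col(X, 1)  # nW=1: each output row is [previous, self, next]
assert cols.shape == (3, 3)
# The first row has no left neighbour, so it is zero-padded: [0., 1., 2.]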
Example #7
    def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
        assert isinstance(token_data, (BatchEncoding, dict))
        pad_token = token_data.get("pad_token", "[PAD]")
        lengths = [
            len([tok for tok in tokens if tok != pad_token])
            for tokens in token_data["input_texts"]
        ]

        numpy_ops = NumpyOps()

        return cls(
            strings=token_data["input_texts"],
            input_ids=numpy_ops.asarray2i(token_data["input_ids"]),
            attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]),
            lengths=lengths,
            token_type_ids=(numpy_ops.asarray2i(token_data["token_type_ids"])
                            if "token_type_ids" in token_data else None),
        )
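
Because the assertion also admits plain dicts, a hypothetical minimal input (the keys mirror those used above; the ids and mask values are made up) would be:

token_data = {
    "input_texts": [["[CLS]", "hello", "world", "[SEP]", "[PAD]"]],
    "input_ids": [[101, 7592, 2088, 102, 0]],
    "attention_mask": [[1.0, 1.0, 1.0, 1.0, 0.0]],
}
batch = WordpieceBatch.from_batch_encoding(token_data)  # lengths == [4]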
Example #8
def set_backend(name, gpu_id):
    global CONFIG
    if name == "jax":
        set_current_ops(JaxOps())
        CONFIG = CONFIG.replace("PyTorch", "")
    else:
        if gpu_id == -1:
            set_current_ops(NumpyOps())
        else:
            set_current_ops(CupyOps())
        CONFIG = CONFIG.replace("LSTM.v1", "PyTorchLSTM.v1")
Example #9
def set_backend(name, gpu_id):
    global CONFIG
    if name == "generic":
        set_current_ops(Ops())
    else:
        if gpu_id == -1:
            set_current_ops(NumpyOps(use_blis=True))
        else:
            set_current_ops(CupyOps())
        if name == "pytorch":
            import torch
            torch.set_num_threads(1)
            CONFIG = CONFIG.replace("LSTM.v1", "PyTorchLSTM.v1")
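
Either variant is called once, before the model is built. For example (assuming the surrounding script defines CONFIG):

set_backend("pytorch", gpu_id=-1)  # CPU NumpyOps with BLIS; config now uses PyTorchLSTM.v1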
Example #10
def test_partials_from_config():
    """Test that functions registered with partial applications are handled
    correctly (e.g. initializers)."""
    name = "uniform_init.v1"
    cfg = {"test": {"@initializers": name, "lo": -0.2}}
    func = my_registry.make_from_config(cfg)["test"]
    assert hasattr(func, "__call__")
    # The partial will still have lo as an arg, just with default
    assert len(inspect.signature(func).parameters) == 4
    # Make sure returned partial function has correct value set
    assert inspect.signature(func).parameters["lo"].default == -0.2
    # Actually call the function and verify
    func(NumpyOps(), (2, 3))
    # Make sure validation still works
    bad_cfg = {"test": {"@initializers": name, "lo": [0.5]}}
    with pytest.raises(ConfigValidationError):
        my_registry.make_from_config(bad_cfg)
    bad_cfg = {"test": {"@initializers": name, "lo": -0.2, "other": 10}}
    with pytest.raises(ConfigValidationError):
        my_registry.make_from_config(bad_cfg)
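
For context, the registration this test exercises might look like the sketch below (assuming my_registry exposes an initializers table, as in thinc's test helpers). Resolving the config fills in lo and returns a partial over the remaining arguments, which is why the signature still reports four parameters (ops, shape, lo, hi):

@my_registry.initializers("uniform_init.v1")
def uniform_init(ops, shape, *, lo: float = -0.1, hi: float = 0.1):
    return ops.xp.random.uniform(lo, hi, shape).astype("float32")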
Example #11
def sgd():
    return SGD(0.001, ops=NumpyOps())
Example #12
def ops():
    return NumpyOps()
Example #13
from typing import List, Optional

from numpy.testing import assert_almost_equal
from thinc.api import registry, with_padded, Dropout, NumpyOps, Model
from thinc.util import data_validation, get_width
from thinc.types import Ragged, Padded, Array2d, Floats2d, FloatsXd, Shape
from thinc.util import has_torch
import numpy
import pytest

OPS = NumpyOps()


class NoDropoutOps(NumpyOps):
    def get_dropout_mask(self, shape: Shape,
                         drop: Optional[float]) -> FloatsXd:
        if drop is None or drop <= 0:
            return self.xp.ones(shape, dtype="f")
        else:
            raise ValueError(
                "During prediction, dropout should not be applied")


array1d = OPS.xp.asarray([1, 2, 3], dtype="f")
array1dint = OPS.xp.asarray([1, 2, 3], dtype="i")
array2d = OPS.xp.asarray([[4, 2, 3, 4], [1, 5, 3, 1], [9, 8, 5, 7]], dtype="f")
array2dint = OPS.xp.asarray([[1, 2, 3], [4, 5, 6]], dtype="i")
array3d = OPS.xp.zeros((3, 3, 3), dtype="f")
ragged = Ragged(array2d, OPS.xp.asarray([2, 1], dtype="i"))
padded = Padded(array3d, array1d, OPS.asarray1i([1, 2, 3, 4]),
                OPS.asarray1i([0, 1, 2]))  # final `indices` argument is a guess; the original snippet is truncated here
Example #14
def get_model(W_values, b_values):
    model = Linear(W_values.shape[0], W_values.shape[1], ops=NumpyOps())
    model.initialize()
    model.set_param("W", W_values)
    model.set_param("b", b_values)
    return model
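
Used, for example, to pin expected outputs against hand-picked weights (the values here are arbitrary; thinc's Linear computes X @ W.T + b):

import numpy

W = numpy.asarray([[1.0, 0.0], [0.0, 2.0]], dtype="f")
b = numpy.asarray([0.5, -0.5], dtype="f")
model = get_model(W, b)
Y = model.predict(numpy.asarray([[3.0, 4.0]], dtype="f"))
assert numpy.allclose(Y, [[3.5, 7.5]])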
Example #15


@pytest.mark.parametrize("ops", [Ops(), NumpyOps()])
@pytest.mark.parametrize("nO,nI", [(1, 2), (2, 2), (100, 200), (9, 6)])
def test_LSTM_init_with_sizes(ops, nO, nI):
    model = with_padded(LSTM(nO, nI, depth=1)).initialize()
    for node in model.walk():
        node.ops = ops
        # Check no unallocated params.
        assert node.has_param("LSTM") is not None
        assert node.has_param("HC0") is not None
    for node in model.walk():
        # Check param sizes.
        if node.has_param("LSTM"):
            params = node.get_param("LSTM")
            assert params.shape == ((nO * 4 * nI) + (nO * 4) + (nO * 4 * nO) + (nO * 4),)
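
The expected shape is the standard LSTM parameter count, flattened into one vector: four gates, each with input weights (nO x nI), recurrent weights (nO x nO), and separate input and recurrent biases (nO each). For nO=2, nI=3:

nO, nI = 2, 3
total = (nO * 4 * nI) + (nO * 4) + (nO * 4 * nO) + (nO * 4)
assert total == 24 + 8 + 16 + 8 == 56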
Example #16
def get_input(nr_batch, nr_in):
    ops = NumpyOps()
    return ops.alloc2f(nr_batch, nr_in)
Example #17
File: lstm_tagger.py Project: vezir/thinc
def set_backend(name, gpu_id):
    global CONFIG  # needed: CONFIG is reassigned below, so a bare local would raise UnboundLocalError
    if gpu_id == -1:
        set_current_ops(NumpyOps())
    else:
        set_current_ops(CupyOps())
    CONFIG = CONFIG.replace("LSTM.v1", "PyTorchLSTM.v1")
Example #18
def test_initializer_func_setup(init_func):
    ops = NumpyOps()
    data = numpy.ndarray([1, 2, 3, 4], dtype="f")  # uninitialized array of shape (1, 2, 3, 4)
    result = init_func(ops, data.shape)
    assert not numpy.array_equal(data, result)
Example #19
def test_initializer_from_config(name, kwargs):
    """Test that initializers are loaded and configured correctly from registry
    (as partials)."""
    cfg = {"test": {"@initializers": name, **kwargs}}
    func = registry.make_from_config(cfg)["test"]
    func(NumpyOps(), (1, 2, 3, 4))
Example #20
import inspect

import numpy
import pytest
from hypothesis.strategies import composite, integers
from numpy.testing import assert_allclose
from thinc.api import NumpyOps, CupyOps, Ops, get_ops
from thinc.api import get_current_ops, use_ops
from thinc.api import fix_random_seed
from thinc.api import LSTM

from .. import strategies
from ..strategies import ndarrays_of_shape


MAX_EXAMPLES = 10

VANILLA_OPS = Ops(numpy)
NUMPY_OPS = NumpyOps()
BLIS_OPS = NumpyOps(use_blis=True)
CPU_OPS = [NUMPY_OPS, VANILLA_OPS]
XP_OPS = [NUMPY_OPS]
if CupyOps.xp is not None:
    XP_OPS.append(CupyOps())
ALL_OPS = XP_OPS + [VANILLA_OPS]


@pytest.mark.parametrize("op", [NumpyOps, CupyOps])
def test_ops_consistency(op):
    """Test that specific ops don't define any methods that are not on the
    Ops base class and that all ops methods define the exact same arguments."""
    attrs = [m for m in dir(op) if not m.startswith("_")]
    for attr in attrs:
        assert hasattr(Ops, attr)
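
The docstring also promises that overridden methods keep the exact same arguments; that half is not shown above, but could be sketched with the already-imported inspect module (a simplified, hypothetical helper; some compiled methods expose no signature):

def check_signatures_match(op):
    for attr in (m for m in dir(op) if not m.startswith("_")):
        base_attr, sub_attr = getattr(Ops, attr), getattr(op, attr)
        if callable(base_attr) and callable(sub_attr):
            try:
                base_sig = inspect.signature(base_attr)
                sub_sig = inspect.signature(sub_attr)
            except (TypeError, ValueError):
                continue  # no introspectable signature
            assert list(sub_sig.parameters) == list(base_sig.parameters), attr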
Example #21
def get_ops():
    return NumpyOps()
Example #22
    with Model.define_operators({">>": chain}):
        model = model1 >> model2 >> model3
        assert len(model.layers) == 2
        assert len(model.layers[0].layers) == 2


def test_chain_right_branch(model1, model2, model3):
    # Previously we 'flattened' these nested calls. We might opt to do so
    # again, especially for the operators.
    merge1 = chain(model1, model2)
    merge2 = chain(merge1, model3)
    assert len(merge1.layers) == 2
    assert len(merge2.layers) == 2


@pytest.mark.parametrize("ops", [NumpyOps(), NumpyOps(use_blis=True)])
def test_chain(ops):
    data = numpy.asarray([[1, 2, 3, 4]], dtype="f")
    model = chain(Linear(1), Dropout(), Linear(1))
    model.ops = ops
    model.initialize(data, data)
    Y, backprop = model(data, is_train=True)
    backprop(Y)
    # Layers with and without nO/nI
    model = chain(Linear(1), Dropout(), Linear(1, 1))
    model.initialize(data, data)
    # Setting dim on model
    model = chain(Linear(1), Dropout(), Linear(1))
    model.set_dim("nO", 1)
    model.initialize(data, None)
    model = chain(Linear(1, 1), Dropout(), Linear(1, 1))
Example #23
from typing import List

import numpy
import pytest
from numpy.testing import assert_almost_equal
from spacy.vocab import Vocab
from thinc.api import NumpyOps, Model, data_validation
from thinc.types import Array2d, Ragged

from spacy.lang.en import English
from spacy.ml import FeatureExtractor, StaticVectors
from spacy.ml._character_embed import CharacterEmbed
from spacy.tokens import Doc

OPS = NumpyOps()

texts = ["These are 4 words", "Here just three"]
l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
l1 = [[9, 8], [7, 6], [5, 4]]
list_floats = [OPS.xp.asarray(l0, dtype="f"), OPS.xp.asarray(l1, dtype="f")]
list_ints = [OPS.xp.asarray(l0, dtype="i"), OPS.xp.asarray(l1, dtype="i")]
array = OPS.xp.asarray(l1, dtype="f")
ragged = Ragged(array, OPS.xp.asarray([2, 1], dtype="i"))


def get_docs():
    vocab = Vocab()
    for t in texts:
        for word in t.split():
            hash_id = vocab.strings.add(word)
            vector = numpy.random.uniform(-1, 1, (7, ))