Example #1
def test_dynamic_store_size():
    """Tests size of StringStore in vocab after creating doc"""

    me = hook.local_worker
    nlp = syfertext.load("en_core_web_lg", owner=me)

    # Check that no string is present in store
    assert len(nlp.vocab.store) == 0

    doc = nlp("quick brown fox jumps")

    # Check that 4 strings have been added in store
    assert len(nlp.vocab.store) == 4
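
    # A hedged follow-up (not part of the original test): assuming the
    # StringStore de-duplicates entries, tokenizing a second text should only
    # add the strings that are not already present.
    doc2 = nlp("quick red fox")

    # "quick" and "fox" are already in the store, so only "red" is new
    assert len(nlp.vocab.store) == 5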
Example #2
def test_number_of_subpipelines_created_with_pipes_of_different_remote_values(
):
    """Test the number of subpipelines created when we use pipe components
    with different remote values.
    """

    nlp = syfertext.load("en_core_web_lg", owner=me)

    # Add the pipeline components to SyferText pipeline
    nlp.add_pipe(noun_tagger, name="noun tagger", remote=True)
    nlp.add_pipe(verb_tagger, name="verb tagger", remote=False)
    nlp.add_pipe(pronoun_tagger, name="pronoun tagger", remote=False)
    nlp.add_pipe(adjective_tagger, name="adjective tagger", remote=True)

    # Note: The tokenizer is added by default with remote = True,
    # and adjacent pipes with the same remote value are grouped together
    # in a single dictionary. So nlp.subpipeline_templates should be:
    #
    # nlp.subpipeline_templates = [ {'remote': True, 'names': ['tokenizer', 'noun tagger']},
    #                               {'remote': False, 'names': ['verb tagger', 'pronoun tagger']},
    #                               {'remote': True, 'names': ['adjective tagger']}
    #                             ]

    remote_subpipelines = [s for s in nlp.subpipeline_templates if s["remote"]]
    local_subpipelines = [s for s in nlp.subpipeline_templates if not s["remote"]]

    # Assert subpipeline_templates contains 3 subpipelines,
    # two with remote = True and one with remote = False
    assert len(nlp.subpipeline_templates) == 3
    assert len(remote_subpipelines) == 2
    assert len(local_subpipelines) == 1

    # Assert the relative order of subpipelines in subpipeline templates
    for subpipeline, remote_value in zip(nlp.subpipeline_templates,
                                         [True, False, True]):
        assert subpipeline["remote"] == remote_value

    # Make sure the subpipelines contain the correct number of pipes
    for subpipeline, num_pipes in zip(nlp.subpipeline_templates, [2, 2, 1]):
        assert len(subpipeline["names"]) == num_pipes

    # Make sure subpipelines contains pipes in the correct order
    pipes = [["tokenizer", "noun tagger"], ["verb tagger", "pronoun tagger"],
             ["adjective tagger"]]
    for subpipeline, pipe_names in zip(nlp.subpipeline_templates, pipes):
        assert subpipeline["names"] == pipe_names
Example #3
def test_pipeline_output():
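    """Test that processing a remote String returns a DocPointer and deploys
    a single subpipeline on the remote worker."""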

    nlp = syfertext.load("en_core_web_lg", owner=me)

    james = sy.VirtualWorker(hook, id="james")

    # Create a PySyft String and send it to remote worker james
    text_ptr = String("building SyferText").send(james)

    # Add tagger with remote = True
    tagger = SimpleTagger(attribute="noun", lookups=["SyferText"], tag=True)
    nlp.add_pipe(tagger, name="noun_tagger", remote=True)

    # Upon processing the text present on james's machine,
    # the pipeline should return a DocPointer to the doc on james's machine
    doc = nlp(text_ptr)
    assert isinstance(doc, DocPointer)

    # assert only one document object on james's machine
    documents = [v for v in james._objects.values() if isinstance(v, Doc)]
    assert len(documents) == 1

    # assert returned doc_pointer points to document object on james's machine
    assert doc.id_at_location == documents[0].id

    # assert only one subpipeline object on james's machine
    subpipelines = [
        v for v in james._objects.values() if isinstance(v, SubPipeline)
    ]
    assert len(subpipelines) == 1

    # Make sure subpipeline object contains tokenizer and tagger
    pipes = subpipelines[0].pipe_names
    assert len(pipes) == 2
    assert pipes[0] == "tokenizer"
    assert pipes[1] == "noun_tagger"

    # nlp.pipeline stores pointers to subpipeline objects on remote machines
    # assert the subpipeline pointer stored in nlp.pipeline points to the subpipeline on james's machine
    assert nlp.pipeline[0]["james"].id_at_location == subpipelines[0].id
Example #4
def test_addition_and_removal_of_pipeline_components():
    """Test the add_pipe and remove_pipe methods.
    """

    nlp = syfertext.load("en_core_web_lg", owner=me)

    # Add the pipeline components to SyferText pipeline
    nlp.add_pipe(noun_tagger, name="noun tagger")
    nlp.add_pipe(verb_tagger, name="verb tagger")

    # Note: Tokenizer is always the first component in any pipeline and
    # is added by default to the nlp.pipeline_template.
    # So the current state of the nlp.pipeline_template should be like this
    # nlp.pipeline_template = [{'remote': True, 'name': 'tokenizer'},
    #                          {'remote': False, 'name': 'noun tagger'},
    #                          {'remote': False, 'name': 'verb tagger'}]
    assert len(nlp.pipeline_template) == 3

    # Remove noun tagger from the pipeline
    nlp.remove_pipe(name="noun tagger")

    # Assert pipeline has two components
    assert len(nlp.pipeline_template) == 2
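
    # A hedged follow-up (not part of the original test): based on the
    # pipeline_template layout described in the comment above, the remaining
    # entries should be the tokenizer and the verb tagger, in that order.
    remaining_names = [pipe["name"] for pipe in nlp.pipeline_template]
    assert remaining_names == ["tokenizer", "verb tagger"]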
Example #5
import syft as sy
import torch
import syfertext

from syft.generic.string import String

from syfertext.doc import Doc
from syfertext.span import Span
from syfertext.pointers.doc_pointer import DocPointer
from syfertext.pointers.span_pointer import SpanPointer

hook = sy.TorchHook(torch)
me = hook.local_worker

nlp = syfertext.load("en_core_web_lg", owner=me)


def test_creation_of_basic_span():
    """Test the __getitem__() method of doc returns
    a Span when passed in a slice."""

    doc = nlp("the quick brown fox jumps over lazy dog")

    span = doc[1:5]

    actual_tokens = ["quick", "brown", "fox", "jumps"]

    assert len(span) == len(actual_tokens)

    for token, actual_token in zip(span, actual_tokens):
        assert token.text == actual_token
Example #6
import syft as sy
import torch
import syfertext

hook = sy.TorchHook(torch)
me = hook.local_worker
lang = "en_core_web_lg"
nlp = syfertext.load(lang, owner=me)
vocab = nlp.vocab


def test_token_text_with_ws():
    text = "Green Apple "
    doc = nlp(text)
    tok1 = doc[0]
    tok2 = doc[1]

    assert tok1.text_with_ws + tok2.text_with_ws == text


def test_token_lex_id():
    text = "apple"

    # create a token object
    token = nlp(text)[0]

    # Get the Lexeme object from vocab
    lexeme = vocab[text]

    # test that lexeme rank and token lex_id are equal
    assert token.lex_id == lexeme.rank
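

def test_doc_text_reconstruction():
    """A hedged extra check (not part of the original suite): assuming a Doc is
    iterable over its tokens the way a Span is, joining each token's
    text_with_ws should reproduce the original text."""

    text = "Green Apple "
    doc = nlp(text)

    assert "".join(token.text_with_ws for token in doc) == text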
Example #7
def test_subpipeline_is_not_recreated_in_remote_workers():
    """Test that the a subpipeline at a given index is not recreated in remote workers after it
    has been initialized once. Each worker contains a single subpipeline, with multiple components.
    """

    nlp = syfertext.load("en_core_web_lg", owner=me)

    alice = sy.VirtualWorker(hook, id="alice")
    bob = sy.VirtualWorker(hook, id="bob")

    # Create 4 PySyft Strings and send them to remote workers
    # (3 to Bob, 1 to Alice)
    texts = [String(text) for text in ["hello", "syfertext", "private", "nlp"]]

    texts_ptr = [
        texts[0].send(bob), texts[1].send(bob), texts[2].send(alice),
        texts[3].send(bob)
    ]

    # The first time a text owned by `bob` is tokenized, a `SubPipeline` object is
    # created by the `nlp` object and sent to `bob`. The `nlp` object keeps a
    # pointer to the subpipeline object on `bob`'s machine.
    doc1 = nlp(texts_ptr[0])
    subpipelines = [
        v for v in bob._objects.values() if isinstance(v, SubPipeline)
    ]
    documents = [v for v in bob._objects.values() if isinstance(v, Doc)]
    assert len(subpipelines) == 1
    assert len(documents) == 1
    assert len(nlp.pipeline[0].keys()) == 1
    assert "bob" in nlp.pipeline[0]

    # The second time a text owned by `bob` is tokenized, no new `SubPipeline`
    # objects are created. Only a new document on `bob`'s machine.
    doc2 = nlp(texts_ptr[1])
    subpipelines = [
        v for v in bob._objects.values() if isinstance(v, SubPipeline)
    ]
    documents = [v for v in bob._objects.values() if isinstance(v, Doc)]
    assert len(subpipelines) == 1
    assert len(documents) == 2
    assert len(nlp.pipeline[0].keys()) == 1

    # The first time a text owned by `alice` is tokenized, a new `SubPipeline` object
    # is created by the `nlp` object and sent to `alice`. Now the `nlp` object has
    # a second pointer to a `SubPipeline`.
    doc3 = nlp(texts_ptr[2])
    subpipelines = [
        v for v in alice._objects.values() if isinstance(v, SubPipeline)
    ]
    documents = [v for v in alice._objects.values() if isinstance(v, Doc)]
    assert len(subpipelines) == 1
    assert len(documents) == 1
    assert len(nlp.pipeline[0].keys()) == 2
    assert "alice" in nlp.pipeline[0]

    # The third time a text owned by `bob` is tokenized, no new `SubPipeline`
    # objects are created. The `nlp` object still has the same pointer
    # to the `SubPipeline` on `bob`'s machine, and `bob` now has a third document.
    doc4 = nlp(texts_ptr[3])
    subpipelines = [
        v for v in bob._objects.values() if isinstance(v, SubPipeline)
    ]
    documents = [v for v in bob._objects.values() if isinstance(v, Doc)]
    assert len(subpipelines) == 1
    assert len(documents) == 3
    assert len(nlp.pipeline[0].keys()) == 2
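
# Illustration only: the caching behaviour exercised above boils down to
# "send a subpipeline to a worker only the first time that worker is seen".
# A minimal sketch of the idea, not SyferText's actual implementation:


def get_or_create_subpipeline(pipeline, index, worker_id, create_fn):
    """Return the cached subpipeline pointer for `worker_id` at position `index`,
    creating and storing one only on the first call."""
    if worker_id not in pipeline[index]:
        pipeline[index][worker_id] = create_fn(worker_id)
    return pipeline[index][worker_id]


# First call for "bob" creates a (dummy) subpipeline; later calls reuse it.
pipeline = [{}]
created = []
make = lambda worker_id: created.append(worker_id) or "subpipeline@" + worker_id
assert get_or_create_subpipeline(pipeline, 0, "bob", make) == "subpipeline@bob"
assert get_or_create_subpipeline(pipeline, 0, "bob", make) == "subpipeline@bob"
assert created == ["bob"]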
Example #8
        # print for debugging purposes
        # print("mapping",example['label']," to ",one_hot_label)

        # Send the transcription label
        example['label'] = one_hot_label.send(worker)


# Bob's remote dataset
make_remote_dataset(train_bob, bob)
make_remote_dataset(val_bob, bob)

# Alice's remote dataset
make_remote_dataset(train_alice, alice)
make_remote_dataset(val_alice, alice)
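
# A hedged sketch of what the make_remote_dataset helper used above might look
# like; the one-hot encoding and the `num_classes` parameter are assumptions
# inferred from the fragment at the top of this example, not the original code.
def make_remote_dataset_sketch(dataset, worker, num_classes=2):
    """Turn each example's integer label into a one-hot tensor and send it
    to the given remote worker (illustrative only)."""
    for example in dataset:
        one_hot_label = torch.nn.functional.one_hot(
            torch.tensor(example['label']), num_classes
        ).float()

        # Send the transcription label to the remote worker
        example['label'] = one_hot_label.send(worker)
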
# Create a Language object with SyferText
nlp = syfertext.load('en_core_web_lg', owner=me)
use_stop_tagger = True
use_vocab_tagger = True

# Tokens with these custom tags
# will be excluded when creating
# the Doc vector
excluded_tokens = {}
# Load the list of stop words
with open('./data/clinical-stopwords.txt', 'r') as f:
    stop_words = set(f.read().splitlines())

# Create a simple tagger object to tag stop words
stop_tagger = SimpleTagger(attribute='is_stop',
                           lookups=stop_words,
                           tag=True,