def test_issue1915():
    """Passing an unknown config key to initialize should raise ValueError."""
    bad_cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    ner = nlp.add_pipe("ner")
    ner.add_label("answer")
    with pytest.raises(ValueError):
        nlp.initialize(**bad_cfg)
def test_implicit_labels():
    """Spancat labels are inferred from the training examples at initialization."""
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(spancat.labels) == 0
    examples = make_examples(nlp)
    nlp.initialize(get_examples=lambda: examples)
    assert spancat.labels == ("PERSON", "LOC")
def test_implicit_label():
    """The tagger picks up its labels implicitly from the training data."""
    nlp = Language()
    nlp.add_pipe("tagger")
    examples = [Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA]
    nlp.initialize(get_examples=lambda: examples)
def nlp():
    """Fixture: a pipeline with an initialized two-label textcat component."""
    pipeline = Language(Vocab())
    textcat = pipeline.add_pipe("textcat")
    for cat in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(cat)
    pipeline.initialize()
    return pipeline
def test_explicit_labels():
    """Spancat labels added explicitly before initialize are preserved in order."""
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(spancat.labels) == 0
    for label in ("PERSON", "LOC"):
        spancat.add_label(label)
    nlp.initialize()
    assert spancat.labels == ("PERSON", "LOC")
def test_error_with_multi_labels():
    """Single-label textcat must reject multi-label training data."""
    nlp = Language()
    nlp.add_pipe("textcat")
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_MULTI_LABEL
    ]
    with pytest.raises(ValueError):
        nlp.initialize(get_examples=lambda: examples)
def test_ner_labels_added_implicitly_on_greedy_parse():
    """greedy_parse adds unseen entity labels from the doc to the beam NER."""
    nlp = Language()
    ner = nlp.add_pipe("beam_ner")
    for label in ["A", "B", "C"]:
        ner.add_label(label)
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    ner.greedy_parse([doc])
    assert "D" in ner.labels
def test_implicit_labels():
    """Spancat infers its label set from TRAIN_DATA during initialization."""
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(spancat.labels) == 0
    examples = [Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA]
    nlp.initialize(get_examples=lambda: examples)
    assert spancat.labels == ("PERSON", "LOC")
def test_no_resize():
    """Adding a label after initialize must fail: morphologizer is not resizable."""
    nlp = Language()
    morphologizer = nlp.add_pipe("morphologizer")
    sep = Morphology.FIELD_SEP
    morphologizer.add_label("POS" + sep + "NOUN")
    morphologizer.add_label("POS" + sep + "VERB")
    nlp.initialize()
    # this throws an error because the morphologizer can't be resized after initialization
    with pytest.raises(ValueError):
        morphologizer.add_label("POS" + sep + "ADJ")
def test_tagger_initialize_tag_map():
    """Test that Tagger.initialize() without gold tuples does not clobber
    the tag map."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    label_count_before = len(tagger.labels)
    tagger.add_label("A")
    nlp.initialize()
    assert len(nlp.get_pipe("tagger").labels) == label_count_before + 1
def tagger():
    """Fixture: an initialized tagger with a single label.

    A model must be attached for two reasons:
    1. no model leads to error in serialization,
    2. the affected line is the one for model serialization.
    """
    nlp = Language()
    component = nlp.add_pipe("tagger")
    component.add_label("A")
    nlp.initialize()
    return component
def test_no_resize(name):
    """Adding a label after initialize must fail: this textcat is not resizable."""
    nlp = Language()
    textcat = nlp.add_pipe(name)
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.initialize()
    assert textcat.model.get_dim("nO") >= 2
    # this throws an error because the textcat can't be resized after initialization
    with pytest.raises(ValueError):
        textcat.add_label("NEUTRAL")
def test_ner_labels_added_implicitly_on_predict():
    """Calling the NER on a doc with an unseen entity label adds that label."""
    nlp = Language()
    ner = nlp.add_pipe("ner")
    for label in ["A", "B", "C"]:
        ner.add_label(label)
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    ner(doc)
    assert [token.ent_type_ for token in doc] == ["D", ""]
    assert "D" in ner.labels
def test_ner_labels_added_implicitly_on_update():
    """nlp.update() with gold containing an unseen entity label adds it to NER."""
    nlp = Language()
    ner = nlp.add_pipe("ner")
    for label in ["A", "B", "C"]:
        ner.add_label(label)
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    example = Example(nlp.make_doc(doc.text), doc)
    assert "D" not in ner.labels
    nlp.update([example])
    assert "D" in ner.labels
def test_no_resize():
    """Adding a label after initialize must fail: spancat is not resizable."""
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    for label in ("Thing", "Phrase"):
        spancat.add_label(label)
    assert spancat.labels == ("Thing", "Phrase")
    nlp.initialize()
    assert spancat.model.get_dim("nO") == 2
    # this throws an error because the spancat can't be resized after initialization
    with pytest.raises(ValueError):
        spancat.add_label("Stuff")
def test_issue2564():
    """Test the tagger sets has_annotation("TAG") correctly when used via
    Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.initialize()
    # direct call
    doc = nlp("hello world")
    assert doc.has_annotation("TAG")
    # via the streaming API
    piped_doc = next(nlp.pipe(["hello", "world"]))
    assert piped_doc.has_annotation("TAG")
def test_no_resize():
    """Adding a label after initialize must fail: tagger is not resizable."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    for label in ("N", "V"):
        tagger.add_label(label)
    assert tagger.labels == ("N", "V")
    nlp.initialize()
    assert tagger.model.get_dim("nO") == 2
    # this throws an error because the tagger can't be resized after initialization
    with pytest.raises(ValueError):
        tagger.add_label("J")
def test_no_resize(name, textcat_config):
    """The old textcat architectures weren't resizable"""
    nlp = Language()
    textcat = nlp.add_pipe(name, config={"model": textcat_config})
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.initialize()
    assert textcat.model.maybe_get_dim("nO") in [2, None]
    # this throws an error because the textcat can't be resized after initialization
    with pytest.raises(ValueError):
        textcat.add_label("NEUTRAL")
def test_label_types(name):
    """add_label rejects non-string labels; single-label textcat additionally
    refuses to initialize with fewer than two labels."""
    nlp = Language()
    textcat = nlp.add_pipe(name)
    textcat.add_label("answer")
    with pytest.raises(ValueError):
        textcat.add_label(9)
    # textcat requires at least two labels
    if name == "textcat":
        with pytest.raises(ValueError):
            nlp.initialize()
    else:
        nlp.initialize()
def test_issue9904():
    """The textcat loss should be independent of the batch size."""
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    get_examples = make_get_examples_single_label(nlp)
    nlp.initialize(get_examples)
    examples = get_examples()
    scores = textcat.predict([eg.predicted for eg in examples])
    loss = textcat.get_loss(examples, scores)[0]
    loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
    assert loss == pytest.approx(loss_double_bs)
def test_resize(name, textcat_config):
    """The new textcat architectures are resizable"""
    nlp = Language()
    textcat = nlp.add_pipe(name, config={"model": textcat_config})
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    assert textcat.model.maybe_get_dim("nO") in [2, None]
    nlp.initialize()
    assert textcat.model.maybe_get_dim("nO") in [2, None]
    # resizing after initialization is allowed for these architectures
    textcat.add_label("NEUTRAL")
    assert textcat.model.maybe_get_dim("nO") in [3, None]
def test_doc_gc():
    # If the Doc object is garbage collected, the spans won't be functional
    # afterwards
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for text, spangroups in zip(texts, all_spans):
        assert isinstance(spangroups, SpanGroups)
        for key, spangroup in spangroups.items():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            # the backing Doc has been garbage collected, so span access fails
            with pytest.raises(RuntimeError):
                span = spangroup[0]
def entity_linker():
    """Fixture: an initialized entity_linker backed by a one-entity KB.

    A model must be attached for two reasons:
    1. no model leads to error in serialization,
    2. the affected line is the one for model serialization.
    """
    nlp = Language()

    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
        return kb

    component = nlp.add_pipe("entity_linker")
    component.set_kb(create_kb)
    nlp.initialize()
    return component
def test_initialize_examples():
    """initialize accepts a callable yielding Examples and rejects anything else."""
    nlp = Language()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    examples = [Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA]
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize(get_examples=lambda: examples)
    for bad in (lambda: None, lambda: examples[0], lambda: [], examples):
        with pytest.raises(TypeError):
            nlp.initialize(get_examples=bad)
def test_initialize_from_labels():
    """A second lemmatizer initialized from the first one's label_data ends up
    with the same tree-to-label mapping."""
    nlp = Language()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    examples = [Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA]
    nlp.initialize(get_examples=lambda: examples)

    nlp2 = Language()
    lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
    lemmatizer2.initialize(
        get_examples=lambda: examples,
        labels=lemmatizer.label_data,
    )
    assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
def test_simple_train():
    """A tiny textcat learns to separate a toy dataset in a few epochs."""
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("answer")
    nlp.initialize()
    samples = [
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    ]
    for _ in range(5):
        for text, answer in samples:
            nlp.update((text, {"cats": {"answer": answer}}))
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
def test_language_source_and_vectors(nlp2):
    """Sourcing a pipe adds the source's strings but leaves vectors untouched."""
    nlp = Language(Vocab())
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.initialize()
    long_string = "thisisalongstring"
    assert long_string not in nlp.vocab.strings
    assert long_string not in nlp2.vocab.strings
    nlp.vocab.strings.add(long_string)
    assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
    vectors_bytes = nlp.vocab.vectors.to_bytes()
    with pytest.warns(UserWarning):
        nlp2.add_pipe("textcat", name="textcat2", source=nlp)
    # strings should be added
    assert long_string in nlp2.vocab.strings
    # vectors should remain unmodified
    assert nlp.vocab.vectors.to_bytes() == vectors_bytes
def test_simple_train():
    """A spancat trained on a toy corpus reaches perfect span F on it."""
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    get_examples = make_get_examples(nlp)
    nlp.initialize(get_examples)
    sgd = nlp.create_optimizer()
    assert len(spancat.labels) != 0
    for _ in range(40):
        losses = {}
        nlp.update(list(get_examples()), losses=losses, drop=0.1, sgd=sgd)
    doc = nlp("I like London and Berlin.")
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    assert len(doc.spans[spancat.key]) == 2
    assert doc.spans[spancat.key][0].text == "London"
    scores = nlp.evaluate(get_examples())
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0
def test_set_candidates():
    """set_candidates populates the "candidates" span group with the suggested
    spans for each doc."""
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    examples = make_examples(nlp)
    nlp.initialize(get_examples=lambda: examples)
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    docs = [nlp(text) for text in texts]
    spancat.set_candidates(docs)
    assert len(docs) == len(texts)
    candidates = docs[0].spans["candidates"]
    assert type(candidates) == SpanGroup
    assert len(candidates) == 9
    assert candidates[0].text == "Just"
    assert candidates[4].text == "Just a"
def test_initialize_examples(name, get_examples, train_data):
    """initialize accepts a proper get_examples callable and rejects bad ones."""
    nlp = Language()
    textcat = nlp.add_pipe(name)
    for text, annotations in train_data:
        for label, value in annotations.get("cats").items():
            textcat.add_label(label)
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    nlp.initialize(get_examples=get_examples(nlp))
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=get_examples())