def test_doc_array_attr_of_token(en_vocab):
    doc = Doc(en_vocab, words=["An", "example", "sentence"])
    example = doc.vocab["example"]
    assert example.orth != example.shape
    feats_array = doc.to_array((ORTH, SHAPE))
    assert feats_array[0][0] != feats_array[0][1]
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
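# A minimal usage sketch for the `get_doc` helper above (not part of the
# original source). `en_vocab` is assumed to be the usual pytest vocab fixture,
# and the POS/TAG/DEP values are made up; heads are relative offsets, matching
# how the helper writes them into the HEAD column.
doc = get_doc(
    en_vocab,
    words=["Apple", "is", "great"],
    pos=["PROPN", "AUX", "ADJ"],
    heads=[2, 1, 0],
    deps=["nsubj", "cop", "ROOT"],
    tags=["NNP", "VBZ", "JJ"],
)
assert doc[0].tag_ == "NNP"
assert doc[0].dep_ == "nsubj"
assert doc[0].head.text == "great"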
def test_doc_array_to_from_string_attrs(en_vocab, attrs):
    """Test that both Doc.to_array and Doc.from_array accept string attrs,
    as well as single attrs and sequences of attrs.
    """
    words = ["An", "example", "sentence"]
    doc = Doc(en_vocab, words=words)
    Doc(en_vocab, words=words).from_array(attrs, doc.to_array(attrs))
def __init__(self, nlp, label='GPE'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    # Make request once on initialisation and store the data
    r = requests.get('https://restcountries.eu/rest/v2/all')
    r.raise_for_status()  # make sure requests raises an error if it fails
    countries = r.json()
    # Convert API response to dict keyed by country name for easy lookup
    # This could also be extended using the alternative and foreign language
    # names provided by the API
    self.countries = {c['name']: c for c in countries}
    self.label = nlp.vocab.strings[label]  # get entity label ID
    # Set up the PhraseMatcher with Doc patterns for each country name
    patterns = [nlp(c) for c in self.countries.keys()]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('COUNTRIES', None, *patterns)
    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    # If no default value is set, it defaults to None.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital')
    Token.set_extension('country_latlng')
    Token.set_extension('country_flag')
    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_country == True.
    Doc.set_extension('has_country', getter=self.has_country)
    Span.set_extension('has_country', getter=self.has_country)
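# A minimal sketch of how the component whose __init__ is shown above might be
# plugged into a spaCy v2 pipeline. The class name `RESTCountriesComponent`
# and the example text are assumptions for illustration only.
from spacy.lang.en import English

nlp = English()
component = RESTCountriesComponent(nlp, label='GPE')  # hypothetical class name
nlp.add_pipe(component, last=True)
doc = nlp(u"Some text about Colombia and the Czech Republic")
print([(token.text, token._.is_country) for token in doc])
print('Doc has country:', doc._.has_country)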
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]
def test_serialize_empty_doc(en_vocab):
    doc = Doc(en_vocab)
    data = doc.to_bytes()
    doc2 = Doc(en_vocab)
    doc2.from_bytes(data)
    assert len(doc) == len(doc2)
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
def test_doc_api_similarity_match():
    doc = Doc(Vocab(), words=["a"])
    assert doc.similarity(doc[0]) == 1.0
    assert doc.similarity(doc.vocab["a"]) == 1.0
    doc2 = Doc(doc.vocab, words=["a", "b", "c"])
    with pytest.warns(ModelsWarning):
        assert doc.similarity(doc2[:1]) == 1.0
        assert doc.similarity(doc2) == 0.0
def test_doc_to_json_underscore(doc):
    Doc.set_extension("json_test1", default=False)
    Doc.set_extension("json_test2", default=False)
    doc._.json_test1 = "hello world"
    doc._.json_test2 = [1, 2, 3]
    json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
    assert "_" in json_doc
    assert json_doc["_"]["json_test1"] == "hello world"
    assert json_doc["_"]["json_test2"] == [1, 2, 3]
def test_underscore_dir(en_vocab):
    """Test that dir() correctly returns extension attributes. This enables
    things like tab-completion for the attributes in doc._."""
    Doc.set_extension("test_dir", default=None)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert "_" in dir(doc)
    assert "test_dir" in dir(doc._)
    assert "test_dir" not in dir(doc[0]._)
    assert "test_dir" not in dir(doc[0:2]._)
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
def test_spans_override_sentiment(en_tokenizer):
    """Test that doc.user_span_hooks can override span.sentiment's default
    averaging behaviour."""
    text = "good stuff bad stuff"
    tokens = en_tokenizer(text)
    tokens.vocab[tokens[0].text].sentiment = 3.0
    tokens.vocab[tokens[2].text].sentiment = -2.0
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
    doc.user_span_hooks["sentiment"] = lambda span: 10.0
    assert doc[:2].sentiment == 10.0
    assert doc[-2:].sentiment == 10.0
    assert doc[:-1].sentiment == 10.0
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])
    # Too many heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
def test_underscore_docstring(en_vocab):
    """Test that docstrings are available for extension methods, even though
    they're partials."""

    def test_method(doc, arg1=1, arg2=2):
        """I am a docstring"""
        return (arg1, arg2)

    Doc.set_extension("test_docstrings", method=test_method)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert test_method.__doc__ == "I am a docstring"
    assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
def test_doc_retokenize_split_dependencies(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    dep1 = doc.vocab.strings.add("amod")
    dep2 = doc.vocab.strings.add("subject")
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": [dep1, dep2]},
        )
    assert doc[0].dep == dep1
    assert doc[1].dep == dep2
def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after splitting
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"
def test_underscore_mutable_defaults_list(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Doc.set_extension("mutable", default=[])
    doc1 = Doc(en_vocab, words=["one"])
    doc2 = Doc(en_vocab, words=["two"])
    doc1._.mutable.append("foo")
    assert len(doc1._.mutable) == 1
    assert doc1._.mutable[0] == "foo"
    assert len(doc2._.mutable) == 0
    doc1._.mutable = ["bar", "baz"]
    doc1._.mutable.append("foo")
    assert len(doc1._.mutable) == 3
    assert len(doc2._.mutable) == 0
def test_sbd_serialization_projective(EN):
    """Test that before and after serialization, the sentence boundaries are
    the same.
    """
    example = EN.tokenizer.tokens_from_list(
        u"I bought a couch from IKEA. It was n't very comfortable .".split(' ')
    )
    EN.tagger(example)
    apply_transition_sequence(EN, example, [
        'L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj', 'B-ROOT',
        'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', 'R-acomp', 'D', 'R-punct',
    ])
    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
    assert example.to_bytes() == example_serialized.to_bytes()
    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.is_nered
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.is_nered
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.is_nered
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_nered
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
def test_serialize_after_adding_entity():
    # Re issue #514
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()
    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)
    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5, 6)]
    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]
    byte_string = doc.to_bytes()
def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean
    that "all tokens with that lexeme" are marked as a stop word, so the
    ambiguity here is acceptable. Also see #2390.
    """
    assert not Doc(en_vocab, words=["Los"])[0].is_stop
    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        attrs = {"is_stop": [True, False]}
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].is_stop
    assert not doc[1].is_stop
def main(output_dir=None):
    nlp = English()  # start off with blank English class
    Doc.set_extension('overlap', method=overlap_tokens)
    doc1 = nlp(u"Peach emoji is where it has always been.")
    doc2 = nlp(u"Peach is the superior emoji.")
    print("Text 1:", doc1.text)
    print("Text 2:", doc2.text)
    print("Overlapping tokens:", doc1._.overlap(doc2))
    Doc.set_extension('to_html', method=to_html)
    doc = nlp(u"This is a sentence about Apple.")
    # add entity manually for demo purposes, to make it work without a model
    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
    print("Text:", doc.text)
    doc._.to_html(output=output_dir, style='ent')
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    heads_deps = numpy.asarray(
        [
            [1, 397],
            [4, 436],
            [2, 426],
            [1, 402],
            [0, 8206900633647566924],
            [18446744073709551615, 440],
            [18446744073709551614, 442],
        ],
        dtype="uint64",
    )
    doc = Doc(Vocab(), words="Just what I was looking for .".split())
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)
    assert len(list(doc.sents)) == 1
def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data
    with pytest.raises(ValueError):
        doc.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
def doc(vocab):
    return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
def test_issue2219(en_vocab):
    vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
    add_vecs_to_vocab(en_vocab, vectors)
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(en_vocab, words=[word1, word2])
    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
def set_doc_extensions():
    for attr, attr_info in _doc_extensions.items():
        try:
            Doc.set_extension(attr, **attr_info)
        except ValueError:
            # set_extension raises a ValueError if the attribute has already
            # been registered, so just skip it
            pass
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
from sentence_transformers import SentenceTransformer

from . import util


def get_vector(sent):
    doc = sent.doc
    model_name = doc._.sentence_bert_model_name
    model = SentenceBert.get_model(model_name)
    vector = model.encode([sent.text])[0]
    return vector


# Create an extension where the model name will be stored
Doc.set_extension('sentence_bert_model_name', default=None, force=True)
# Set the extension on token, span and doc level. This will contain the computed vector
Token.set_extension('sentence_bert', getter=get_vector, force=True)
Span.set_extension('sentence_bert', getter=get_vector, force=True)
Doc.set_extension('sentence_bert', getter=get_vector, force=True)


# The pipeline stage factory
@Language.factory('sentence_bert', default_config={
    'model_name': None,
    'debug': True
})
def sentence_bert_factory(nlp, name, model_name, debug):
    if model_name:
import numpy as np
import spacy
from spacy.tokens import Doc, Span, Token

from .base_parser import BaseParser, PTB_TOKEN_ESCAPE

__all__ = ['BeneparComponent', 'NonConstituentException']

# None is not allowed as a default extension value!
NOT_PARSED_SENTINEL = object()
Doc.set_extension('_constituent_data', default=NOT_PARSED_SENTINEL)


class NonConstituentException(Exception):
    pass


#%%
class ConstituentData():
    def __init__(self, starts, ends, labels, loc_to_constituent, label_vocab):
        self.starts = starts
        self.ends = ends
        self.labels = labels
        self.loc_to_constituent = loc_to_constituent
        self.label_vocab = label_vocab


class PartialConstituentData():
    def __init__(self):
        self.starts = [np.array([], dtype=int)]
        self.ends = [np.array([], dtype=int)]
def doc_from_bytes(nlp, bytes):
    """Return a deserialised Doc from the bytes produced by `doc.to_bytes()`."""
    doc = Doc(nlp.vocab).from_bytes(bytes)
    language.set_hooks(doc)
    return doc
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
def test_matcher_no_match(matcher):
    doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
    assert matcher(doc) == []
def test_matcher_match_start(matcher):
    doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
    assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)]
def __call__(self, text):
    words = [f"{self.prefix}{word}" for word in text.split(" ")]
    return Doc(self.vocab, words=words)
def __init__(self, wikigraph: WikiGraph) -> None:
    Doc.set_extension("wiki_spans", default=[])
    Span.set_extension("wiki_pages", default=[])
    self._wg = wikigraph
def test_doc_scalar_attr_of_token(en_vocab):
    doc = Doc(en_vocab, words=["An", "example", "sentence"])
    example = doc.vocab["example"]
    assert example.orth != example.shape
    feats_array = doc.to_array(ORTH)
    assert feats_array.shape == (3,)
def docs(vocab):
    return [
        Doc(vocab, words=["hello", "world"]),
        Doc(vocab, words=["this", "is", "another"]),
    ]
def test_common_vocab_lex_attrs(NLP):
    doc = Doc(NLP.vocab, words=["Lorem", "IPSUM", "dolor", "."])
    assert doc[0].is_title
    assert doc[1].is_upper
    assert doc[2].is_lower
    assert doc[3].is_punct
def __call__(self, text):
    words = text.split()
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
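# A self-contained sketch of a whitespace tokenizer built around the __call__
# above, wired into a blank pipeline. The `WhitespaceTokenizer` class name is
# an assumption for illustration; the pattern mirrors spaCy's custom-tokenizer
# documentation.
import spacy
from spacy.tokens import Doc


class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split()
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
print([token.text for token in doc])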
def test_matcher_match_middle(matcher):
    words = ["I", "like", "Google", "Now", "best"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
from itertools import combinations

from spacy.tokens import Doc
from spacy.tokens import Token

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.utils import split_doc_into_sentences

Doc.set_extension('referential_cohesion_all', default=[], force=True)


class ReferentialCohesionAllSentencesAnalyzer:
    name = 'referential cohesion all sentences analyzer'

    def __init__(self, language: str = 'es') -> None:
        '''
        This constructor will initialize the object that processes referential
        cohesion for adjacent sentences in a text. It goes after the sentencizer.

        Parameters:
        language: The language that this pipeline will be used in.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self.language = language
        self.sentence_analyzer = None

    def __call__(self, doc: Doc) -> Doc:
        '''
def test_matcher_match_end(matcher):
    words = ["I", "like", "java"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)]
def __call__(self):
    return Doc(self.vocab, words=self.all_input_tokens, spaces=self.all_spaces)
def test_issue1257():
    """Test that tokens compare correctly."""
    doc1 = Doc(Vocab(), words=["a", "b", "c"])
    doc2 = Doc(Vocab(), words=["a", "c", "e"])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]
def __init__(self):
    """Initialise the pipeline component."""
    if not Doc.has_extension("flesch_kincaid_grade_level"):
        Doc.set_extension("flesch_kincaid_grade_level", getter=self.fk_grade)
    if not Doc.has_extension("flesch_kincaid_reading_ease"):
        Doc.set_extension("flesch_kincaid_reading_ease", getter=self.fk_ease)
    if not Doc.has_extension("dale_chall"):
        Doc.set_extension("dale_chall", getter=self.dale_chall)
    if not Doc.has_extension("smog"):
        Doc.set_extension("smog", getter=self.smog)
    if not Doc.has_extension("coleman_liau_index"):
        Doc.set_extension("coleman_liau_index", getter=self.coleman_liau)
    if not Doc.has_extension("automated_readability_index"):
        Doc.set_extension("automated_readability_index", getter=self.ari)
    if not Doc.has_extension("forcast"):
        Doc.set_extension("forcast", getter=self.forcast)
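# Hypothetical usage of the readability component whose __init__ is shown
# above. The `Readability` class name, the loaded model and the spaCy v2-style
# `nlp.add_pipe(component)` call are assumptions for illustration, not taken
# from the snippet itself.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(Readability(), last=True)  # Readability is the assumed component class
doc = nlp("The quick brown fox jumps over the lazy dog.")
print(doc._.flesch_kincaid_grade_level, doc._.flesch_kincaid_reading_ease)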
def __call__(self, text):
    # Normalise ASCII brackets to their full-width forms before parsing
    # (assumed mapping; the replacement targets were unreadable in the source)
    t = text.replace("\r", "")
    t = t.replace("(", "（").replace(")", "）").replace("[", "［").replace("]", "］").replace("{", "｛").replace("}", "｝")
    u = self.model(t) if t else ""
    vs = self.vocab.strings
    r = vs.add("ROOT")
    words = []
    lemmas = []
    pos = []
    tags = []
    morphs = []
    heads = []
    deps = []
    spaces = []
    norms = []
    ent_iobs = []
    ent_types = []
    bunsetu = []
    # Parse the model's CoNLL-U output line by line
    for t in u.split("\n"):
        if t == "" or t.startswith("#"):
            continue
        s = t.split("\t")
        if len(s) != 10:
            continue
        id, form, lemma, upos, xpos, feats, head, deprel, _, misc = s
        words.append(form)
        lemmas.append(vs.add(lemma))
        pos.append(vs.add(upos))
        tags.append(vs.add(xpos))
        morphs.append(feats)
        if deprel == "root":
            heads.append(0)
            deps.append(r)
        else:
            # Heads are stored as offsets relative to the current token
            heads.append(int(head) - int(id))
            deps.append(vs.add(deprel))
        spaces.append(False if "SpaceAfter=No" in misc else True)
        i = misc.find("Translit=")
        norms.append(vs.add(form if i < 0 else misc[i + 9:]))
        i = misc.find("NE=")
        if i < 0:
            ent_iobs.append(2)
            ent_types.append(0)
        else:
            j = misc.find("|", i)
            if j < 0:
                j = len(misc)
            if misc[i + 3:i + 4] == "B":
                ent_iobs.append(3)
            else:
                ent_iobs.append(1)
            ent_types.append(vs.add(misc[i + 5:j]))
        bunsetu.append("I")
        if misc.startswith("BunsetuBILabel="):
            bunsetu[-1] = misc[15:16]
    doc = Doc(self.vocab, words=words, spaces=spaces)
    a = numpy.array(
        list(zip(lemmas, pos, tags, deps, heads, norms, ent_iobs, ent_types)),
        dtype="uint64",
    )
    doc.from_array([LEMMA, POS, TAG, DEP, HEAD, NORM, ENT_IOB, ENT_TYPE], a)
    try:
        doc.is_tagged = True
        doc.is_parsed = True
    except:
        # Newer spaCy versions removed is_tagged/is_parsed; set morphology instead
        for i, j in enumerate(morphs):
            if j != "_" and j != "":
                doc[i].set_morph(j)
    doc.user_data["bunsetu_bi_labels"] = bunsetu
    return doc
def test_serialize_doc_roundtrip_bytes(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc_b = doc.to_bytes()
    new_doc = Doc(en_vocab).from_bytes(doc_b)
    assert new_doc.to_bytes() == doc_b
def test_wp_start(wp_tokens, span, expected_start):
    doc = Doc(Vocab(), words=wp_tokens[1:-1])
    doc._.pytt_word_pieces_ = wp_tokens
    doc._.pytt_alignment = align_word_pieces([w.text for w in doc], wp_tokens)
    assert doc[span]._.pytt_start == expected_start
from pprint import pprint
import json
import pickle
from time import time

import tablib
from spacy.tokens import Token, Doc

import role_pattern_nlp
import db
import util
from config import config

with open(config['pattern_eval_sheet'], 'rb') as f:
    pattern_eval_data = tablib.Dataset().load(f.read())

Token.set_extension('has_valence', default=False)
Doc.set_extension('sentence_id', default=None)


def pattern_fitness(pattern, matches, pos_matches, neg_matches):
    true_pos = [m for m in pos_matches if util.match_is_in_list(m, matches)]
    true_neg = [
        m for m in neg_matches if not util.match_is_in_list(m, matches)
    ]
    false_pos = [m for m in neg_matches if util.match_is_in_list(m, matches)]
    false_neg = [
        m for m in pos_matches if not util.match_is_in_list(m, matches)
    ]
    n_true_pos = len(true_pos)
    n_true_neg = len(true_neg)
    n_false_pos = len(false_pos)
    n_false_neg = len(false_neg)
def test_vectors_doc_vector(vocab, text):
    doc = Doc(vocab, words=text)
    assert list(doc.vector)
    assert doc.vector_norm
def __call__(self, text):
    words = text.rstrip().split(' ')
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
def test_vectors_span_vector(vocab, text):
    span = Doc(vocab, words=text)[0:2]
    assert list(span.vector)
    assert span.vector_norm
lines = "-" * len(string)
print(lines)
print(string)
print(lines)

learn_rules = rules[category]

for it in range(0, iterations):
    """
    =============================
        FIND PHRASES BY RULES
    =============================
    """
    patterns = list()
    lt = LoopTimer(update_after=500, avg_length=10000, target=db_size)
    for abstract_id, row in infoDF.iterrows():
        doc = Doc(vocab).from_disk(
            os.path.join(path_to_annotations, f"{abstract_id}.spacy")
        )
        patterns.extend(
            find_phrases_by_rule(doc, learn_rules, phrase_boundaries)
        )
        n = lt.update(f"Find Phrases - {len(patterns)}")
    print()

    """
    =============================
        BUILD MATCHER
    =============================
    """
    matcher = Matcher(vocab)
    lt = LoopTimer(update_after=10000, avg_length=10000, target=len(patterns))
    for p_id, pattern in enumerate(patterns):