def __init__(self, nlp, label='GPE'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    # Make request once on initialisation and store the data
    r = requests.get('https://restcountries.eu/rest/v2/all')
    r.raise_for_status()  # make sure requests raises an error if it fails
    countries = r.json()

    # Convert API response to dict keyed by country name for easy lookup
    # This could also be extended using the alternative and foreign language
    # names provided by the API
    self.countries = {c['name']: c for c in countries}
    self.label = nlp.vocab.strings[label]  # get entity label ID

    # Set up the PhraseMatcher with Doc patterns for each country name
    patterns = [nlp(c) for c in self.countries.keys()]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('COUNTRIES', None, *patterns)

    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    # If no default value is set, it defaults to None.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital', default=False)
    Token.set_extension('country_latlng', default=False)
    Token.set_extension('country_flag', default=False)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_country == True.
    Doc.set_extension('has_country', getter=self.has_country)
    Span.set_extension('has_country', getter=self.has_country)
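# A minimal usage sketch for the country component defined above. Assumptions
# (not part of the original snippet): the class is called
# RESTCountriesComponent, it also defines __call__ and has_country, and the
# spaCy v2 pipeline API is used, where component instances are added directly.
import spacy

nlp = spacy.load('en_core_web_sm')
component = RESTCountriesComponent(nlp)  # hypothetical class name
nlp.add_pipe(component, last=True)
doc = nlp(u"Some text about Colombia and the Czech Republic")
print(doc._.has_country)
print([(t.text, t._.country_capital) for t in doc if t._.is_country])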
def __init__(self, nlp, companies=tuple(), label='ORG'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    self.label = nlp.vocab.strings[label]  # get entity label ID

    # Set up the PhraseMatcher – it can now take Doc objects as patterns,
    # so even if the list of companies is long, it's very efficient
    patterns = [nlp(org) for org in companies]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('TECH_ORGS', None, *patterns)

    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    Token.set_extension('is_tech_org', default=False)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_tech_org == True.
    Doc.set_extension('has_tech_org', getter=self.has_tech_org)
    Span.set_extension('has_tech_org', getter=self.has_tech_org)
def cohesive(self, *, punct: bool = False) -> Dict[str, int]:
    """Return cohesive marker counts.

    The document is processed with a spaCy PhraseMatcher to find the
    cohesive markers given in a list.

    Parameters:
    punct: bool - Flag to take punctuation into account.

    Returns:
    Dictionary with markers as keys and counts as values.
    """
    doc = self.doc
    matcher = self.matcher
    features: DefaultDict[str, int] = defaultdict(int)
    matches = matcher(doc)
    spans = [Span(doc, start, end) for match_id, start, end in matches]
    if punct:
        spans = self._extended_spans(spans)
    for string in (span.text.lower() for span in spans):
        features[string] += 1
    return dict(features)
def __call__(self, doc):
    labels = ["DATE", "GPE", "NORP"]
    previous_labels = ["CAMP", "GHETTO", "DATE", "LOCATION", "NORP", "EVENT"]
    new_ents = []
    for old_ent in doc.ents:
        if old_ent.label_ == "DATE":
            new_ents.append(old_ent)
        elif old_ent.label_ == "GPE":
            # relabel geopolitical entities as LOCATION
            new_ent = Span(doc, old_ent.start, old_ent.end, label="LOCATION")
            new_ents.append(new_ent)
        elif old_ent.label_ == "NORP":
            new_ents.append(old_ent)
        elif old_ent.label_ in previous_labels:
            new_ents.append(old_ent)
    doc.ents = tuple(new_ents)
    return doc
def __call__(self, doc):
    matches = self.matcher(doc)
    spans = [
        Span(doc, match[1], match[2], label=self.all_labels)
        for match in matches
    ]
    for i, span in enumerate(spans):
        span._.set("has_ontols", True)
        for token in span:
            if span.text.lower() in self.terms:
                token._.set("is_ontol_term", True)
                token._.set("ontol_id", self.terms[span.text.lower()]["id"])
            else:
                print("Term not found: ", span.text.lower())
    with doc.retokenize() as retokenizer:
        for span in filter_spans(spans):
            retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
            doc._.ontols = list(doc._.ontols) + [span]
    return doc
def pipe(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Annotates the stream of documents based on the spaCy model"""

    stream1, stream2 = itertools.tee(docs, 2)

    # Remove existing entities from the document
    stream2 = (self.create_new_doc(d) for d in stream2)

    # And run the model
    for _, proc in self.model.pipeline:
        stream2 = proc.pipe(stream2)

    for doc, doc_copy in zip(stream1, stream2):
        doc.spans[self.name] = []

        # Add the annotation
        for ent in doc_copy.ents:
            doc.spans[self.name].append(Span(doc, ent.start, ent.end, ent.label_))

        yield doc
def __call__(self, doc): """Apply the pipeline component on a Doc object and modify it if matches are found. Return the Doc, so it can be processed by the next component in the pipeline, if available. """ matches = self.matcher(doc) spans = [] # keep the spans for later so we can merge them afterwards for _, start, end in matches: # Generate Span representing the entity & set label entity = Span(doc, start, end, label=self.label) spans.append(entity) # Set custom attribute on each token of the entity for token in entity: token._.set('is_tech_org', True) # Overwrite doc.ents and add entity – be careful not to replace! doc.ents = list(doc.ents) + [entity] for span in spans: # Iterate over all spans and merge them into one token. This is done # after setting the entities – otherwise, it would cause mismatched # indices! span.merge() return doc # don't forget to return the Doc!
def test_displacy_rtl():
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    deps = ["foo", "bar", "foo", "baz"]
    heads = [1, 0, 1, -2]
    nlp = Persian()
    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    html = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in html
    assert 'direction="rtl"' in html
    assert 'lang="{}"'.format(nlp.lang) in html
    html = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in html
    assert 'lang="{}"'.format(nlp.lang) in html
def test_history(nlp):
    text = re.sub(
        "\\s+", " ",
        """This is a story about Pierre Lison and his work at
        Yetanothername Inc., which is just a name we invented. But of course,
        Lison did not really work for Yetanothername, because it is a fictious
        name, even when spelled like YETANOTHERNAME.""")
    doc = nlp(text)
    annotator1 = skweak.spacy.ModelAnnotator("spacy", "en_core_web_sm")
    annotator2 = skweak.doclevel.DocumentHistoryAnnotator(
        "hist_cased", "spacy", ["PERSON", "ORG"])
    annotator3 = skweak.doclevel.DocumentHistoryAnnotator(
        "hist_uncased", "spacy", ["PERSON", "ORG"], case_sentitive=False)
    doc = annotator3(annotator2(annotator1(doc)))
    assert Span(doc, 5, 7, "PERSON") in doc.spans["spacy"]
    assert Span(doc, 11, 13, "ORG") in doc.spans["spacy"]
    assert Span(doc, 26, 27, "PERSON") in doc.spans["hist_cased"]
    assert Span(doc, 32, 33, "ORG") in doc.spans["hist_cased"]
    assert Span(doc, 32, 33, "ORG") in doc.spans["hist_uncased"]
    assert Span(doc, 45, 46, "ORG") in doc.spans["hist_uncased"]
def __add_annotation(self, df, col_text, current_index, annotations):
    """
    Add annotations in spaCy format to a dataframe.

    Parameters
    ----------
    df (pandas dataframe): Dataframe with text to be labelled.
    col_text (str): Column in pandas dataframe containing text to be labelled.
    current_index (int): Index of DataFrame row to annotate.
    annotations (dict): Dictionary containing strings of annotation patterns
        for PhraseMatcher.

    Returns
    -------
    Labelled dataframe.
    """
    spans = []
    for label, items in annotations.items():
        if items:
            item_list = [
                i.strip() for i in items.split(self.delimiter) if i.strip() != ""
            ]
            matcher = PhraseMatcher(self.nlp.vocab, attr=self.attr)
            matcher.add(label, [self.nlp(item) for item in item_list])
            doc = self.nlp(df[col_text][current_index])
            matches = matcher(doc)
            spans_new = []
            for match_id, start, end in matches:
                span = Span(doc, start, end, label="")
                spans_new.append(span)
            spans_filtered = spacy.util.filter_spans(spans_new)
            spans.extend([(span.start_char, span.end_char, label)
                          for span in spans_filtered])
        else:
            continue
    entities = {"entities": spans}
    df.at[current_index, "annotations"] = (df[col_text][current_index], entities)
def __call__(self, doc): """Call TargetMatcher on a doc. If `add_ents=True`, then matched spans will be merged in to doc.ents and `doc` will be returned. If `add_ents=False`, then matched spans will be returned as a list, in which case this cannot be used as part of a spaCy pipeline, which requires each component to return the doc, but can be used as a standalone matcher. In addition to extracting spans of text and setting labels, TargetRules can also define setting custom attributes and metadata. Additionally, each resulting span has an attribute span._.target_rule which maps a span to the TargetRule which set it. """ matches = self.matcher(doc) spans = [] for (rule_id, start, end) in matches: rule = self._rule_item_mapping[self.nlp.vocab.strings[rule_id]] span = Span(doc, start=start, end=end, label=rule.category) span._.target_rule = rule if rule.attributes is not None: for (attribute, value) in rule.attributes.items(): try: setattr(span._, attribute, value) except AttributeError as e: raise e spans.append(span) if self.add_ents is True: for span in spans: try: doc.ents += (span, ) # spaCy will raise a value error if the token in span are already # part of an entity (ie., as part of an upstream component) # In that case, let the existing span supersede this one except ValueError as e: # raise e pass return doc else: return spans
def __call__(self, doc: Doc) -> Doc: """Aggregates all weak supervision sources""" if len(doc.spans) > 0: # Extracting the observation data df = self.get_observation_df(doc) # Running the actual aggregation agg_df = self._aggregate(df) if "O" in self.out_labels: # Converting back to spans or token labels output_spans = utils.token_array_to_spans( agg_df.values, self.out_labels) output_probs = utils.token_array_to_probs( agg_df.values, self.out_labels) else: output_spans = agg_df.idxmax(axis=1).to_dict() output_probs = { span: { label: prob for label, prob in distrib.items() if prob > 0.1 } for span, distrib in agg_df.to_dict( orient="index").items() } # Storing the results (both as spans and with the full probs) doc.spans[self.name] = [ Span(doc, start, end, label=label) for (start, end), label in output_spans.items() ] doc.spans[self.name].attrs["probs"] = output_probs doc.spans[self.name].attrs["aggregated"] = True doc.spans[self.name].attrs["sources"] = list(df.columns) return doc
def main():
    nlp = spacy.load('en_core_web_sm')

    # create lists for building a doc
    print('Create Doc #1: -------------------------')
    words = ['spaCy', 'is', 'useful', 'for', 'NLP', '!']
    spaces = [True, True, True, True, False, False]

    # create a doc from the words and spaces lists
    # and pass in the vocab
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    print(doc.text)
    print('\n')

    # create lists for building another doc
    print('Create Doc #2: -------------------------')
    words2 = ['Jazz', 'Winston', 'is', 'the', 'best', '!']
    spaces2 = [True, True, True, True, False, False]

    # create a doc from the words and spaces lists
    # and pass in the vocab
    doc2 = Doc(nlp.vocab, words=words2, spaces=spaces2)
    print(doc2.text)
    print('\n')

    # create a span for 'Jazz Winston' from the doc
    # and assign it the label 'DOG'
    span = Span(doc2, 0, 2, label='DOG')
    print(span.text, span.label_)

    # add the span to the doc's entities
    doc2.ents = [span]

    # print entities' text and labels
    print([(ent.text, ent.label_) for ent in doc2.ents])

    # end program
    print('\nDone.')
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
def test_matcher_sets_return_correct_tokens(matcher):
    patterns = [
        [{"LOWER": {"IN": ["zero"]}}],
        [{"LOWER": {"IN": ["one"]}}],
        [{"LOWER": {"IN": ["two"]}}],
    ]
    matcher.add("TEST", patterns)
    doc = Doc(matcher.vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ["zero", "one", "two"]
def merge_entities(self, e1, e2, doc, keep_first_label=True):
    """
    Take two neighbouring entities and merge them into one span (almost an entity).

    e1 : first entity (e.g. nlp(text).ents[0])
    e2 : second entity
    keep_first_label : if True, keep the label of the first entity,
        otherwise use the label of the second
    """
    if keep_first_label:
        # From looking at French, the label of the union of two entities
        # separated by an apostrophe is the label of the first one,
        # which is why this is the default.
        new_label_ = e1.label_  # (unicode)
        # new_label = e1.label  # get hash value of entity label (int)
    else:
        new_label_ = e2.label_
        # new_label = e2.label

    # Create a Span with the start and end index of the token, not the start
    # and end index of the entity in the document: start and end are the token
    # offsets, while start_char and end_char are the character offsets.
    start_token = e1.start
    end_token = e2.end
    new_entity = Span(doc, start_token, end_token, label=new_label_)
    return new_entity
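# A minimal usage sketch for merge_entities. Assumptions (not in the original
# snippet): `merger` is an instance of the class defining the method above,
# and the NER has split one name into two adjacent entities on `doc`.
e1, e2 = doc.ents[0], doc.ents[1]
if e1.end == e2.start:  # only merge neighbouring entities
    merged = merger.merge_entities(e1, e2, doc, keep_first_label=True)
    print(merged.text, merged.label_)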
def tokensfromdoc(doc):
    d = nlp(doc)
    matches = matcher(d)
    for match_id, start, end in matches:
        term = Span(d, start, end, label='myterms')
        d.ents = list(d.ents) + [term]
    tokens = [w.lemma_ for w in d
              # no pronouns
              if w.pos_ != 'PRON'
              # no punctuation
              and w.pos_ != 'PUNCT'
              # not the beginning of a named entity
              and w.ent_iob_ != 'B'
              # not inside a named entity
              and w.ent_iob_ != 'I'
              # not a stop word
              and not w.is_stop]
    tokens += [de.string.rstrip() for de in d.ents]
    return tokens
def crecord2spans(cr, nlp, ntr):
    """
    Get token offsets using char offsets and merge entity annotations in Doc
    with spaCy's using ner_type_resolver (ntr).
    :param cr: RelationRecord
    :param nlp: spacy model
    :param ntr: NERTypeResolver
    :return: Tuple[spacy.tokens.Span, spacy.tokens.Span]
    """
    doc = nlp(cr.context)
    spans = chars2spans(doc, cr.s_spanr, cr.o_spanr)
    uris = [cr.subject, cr.object]
    _v_ = nlp.vocab.strings
    true_ents = []
    for span, uri in zip(spans, uris):
        # fall back to spaCy's NER tag if the resolver has nothing
        label_str = ntr.get_by_uri(uri, default_type=span.label_)
        if label_str not in _v_:
            log.info('crecord2spans: unknown entity type: "{}"'.format(label_str))
        label_id = _v_[label_str]  # modify StringStore if necessary
        true_ents.append(Span(doc=doc, start=span.start, end=span.end, label=label_id))
    corrected_ents = merge_ents_offsets(true_ents, doc.ents)
    doc.ents = corrected_ents
    return true_ents
def bunsetu_span(token: Token) -> Span:
    bunsetu_bi_list = bunsetu_bi_labels(token.doc)
    start = token.i
    end = start + 1
    for idx in range(start, 0, -1):
        if bunsetu_bi_list[idx] == "B":
            start = idx
            break
    else:
        start = 0
    doc_len = len(token.doc)
    for idx in range(end, doc_len):
        if bunsetu_bi_list[idx] == "B":
            end = idx
            break
    else:
        end = doc_len
    doc = token.doc
    return Span(doc, start=start, end=end,
                label=POS_PHRASE_MAP.get(doc[start:end].root.pos_, ""))
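# A minimal usage sketch for bunsetu_span. Assumptions (not in the original
# snippet): a Japanese pipeline with GiNZA installed, which provides the
# bunsetu B/I labels the helper above relies on.
import spacy

nlp = spacy.load("ja_ginza")
doc = nlp("銀座でランチをご一緒しましょう。")
print(bunsetu_span(doc[1]).text)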
def __call__(self, doc): """Apply the pipeline component on a Doc object and modify it if matches are found. Return the Doc, so it can be processed by the next component in the pipeline, if available. :param doc: text to be analysed :return: text updated with the tags and the entities matched """ matches = self.matcher(doc) spans = [] # keep the spans for later so we can merge them afterwards for _, start, end in matches: # Generate Span representing the entity and set label entity = Span(doc, start, end, label=self.label) spans.append(entity) # Set custom attribute on each token of the entity for token in entity: token._.set('is_metro_fault', True) # Overwrite doc.ents and add entity doc.ents = list(doc.ents) + [entity] for span in spans: # Iterate over all spans and merge them into one token. span.merge() return doc
def Entity():
    print("\nThe outcomes of Entity Extraction are:")
    doc = nlp(u"京东CEO刘强东在美国明尼苏达涉嫌性侵女大学生。")
    for ent in doc.ents:
        # print the IOB tag and entity type of the entity's first token
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))

    from spacy.tokens import Span
    doc = nlp(u"奶茶妹妹遇见VP就有90%的几率1位出道……")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))

    # manually add a WORK_OF_ART span covering the first token
    augment = [Span(doc, 0, 1, label=doc.vocab.strings[u'WORK_OF_ART'])]
    doc.ents = list(doc.ents) + augment
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))
def __call__(self, doc, entities):
    spans = []
    matches = self.matcher(doc)
    for label, start, end in matches:
        span = doc[start:end]
        if span[0].ent_type == label:
            # already labelled with this type, skip
            continue
        # es only: try extending the match (compound)
        compound_expanded = False
        if span.root.lang_ == 'es':
            span = self._try_expand_compound(span, label)
            if span:
                spans.append(span)
                compound_expanded = True
        if not compound_expanded:
            span = Span(doc, start, end, label=label)
            spans.append(span)
    if len(spans) > 0:
        doc.ents = list(doc.ents) + spans
    return doc
def ingest_json_document(doc_json: Mapping, nlp: Language) -> Doc:
    doc = nlp(doc_json["text"])
    token_idx_list = [token.idx for token in doc]
    annotations = sorted(doc_json["labels"], key=lambda x: x[0])
    if len(annotations) == 0 and doc_json["annotation_approver"] is None:
        # raise ValueError
        pass
    first = True
    ents = []
    for annotation in annotations:
        character_start, character_end, label = annotation
        ent_start = bisect_right(token_idx_list, character_start) - 1
        ent_end = bisect_left(token_idx_list, character_end)
        if first:
            first = False
        else:
            ent_start = max(ent_start, ents[-1].end)
        # if ent_start >= ent_end:
        #     raise ValueError
        if ent_start < ent_end:
            ents.append(Span(doc, ent_start, ent_end, label))
    doc.ents = ents
    return doc
def __call__(self, doc):
    doc_low = Doc(self.nlp.vocab,
                  words=[t.lower_ for t in doc],
                  spaces=[t.whitespace_ for t in doc])
    # print("doc_low:", doc_low)
    # matches = self.matcher(doc_low)
    # print("matches1:", matches)
    # # filter matches for overlaps (keep longest span)
    # negation_terms = self.filter_matches(matches)
    # print("negation terms1:", negation_terms)

    # POS trigger
    matches = self.matcher1(doc_low)
    # filter matches for overlaps (keep longest span)
    negation_terms = self.filter_matches(matches)

    # PRE trigger
    matches2 = self.matcher2(doc_low)
    # filter matches for overlaps (keep longest span)
    negation_terms2 = self.filter_matches(matches2)
    negation_terms += negation_terms2

    doc._.negs = [
        Span(doc, start, end, label=rule_tag)
        for rule_tag, start, end in negation_terms
    ]
    # print(">>>", doc._.negs)
    for neg_span in doc._.negs:
        for token in neg_span:
            token._.negation_type_ = neg_span.label
    self.compute_negations(negation_terms, doc)
    return doc
def evaluate(ner_model, examples):
    scorer = Scorer()
    stopper = 0
    for input, annot in examples:
        doc_gold_text = ner_model.make_doc(input)
        doc = ner_model(input)
        arrayOFEntities = [(ent.text, ent.label_) for ent in doc.ents]
        gold = GoldParse(doc_gold_text, entities=annot)
        print("entities is ", arrayOFEntities)
        print("sentence", input)
        print("annot is", annot)
        # for first, last, entity in annot:
        #     print("word is", input[int(first):int(last)])
        # print("gold ner ", gold.ner)
        # modifiedGoldNer(input, arrayOFEntities, gold.ner)
        # print("gold ner after modified ", gold.ner)
        stopper += 1
        pred_value = ner_model(input)
        print(type(pred_value.ents))
        span = Span(pred_value, 20, 21, 0)
        print("span type", type(span), span.label_)
        for ent in pred_value.ents:
            print("pred value is", type(ent))
            print(ent.text)
            print(ent.text, "after modified")
            print(ent.label_)
            print(ent.start)
            print(ent.end)
            # print(pred_value[ent.start:ent.end])
        # pred_value.ents = span
        print("after modification", pred_value.ents)
        print("pred_value", pred_value)
        scorer.score(pred_value, gold)
    return scorer.scores
def ingest_json_document(doc_json: Mapping, nlp: Language, include_other: bool,
                         is_predict=False) -> Doc:
    if is_predict:
        doc = nlp(doc_json["title"] + "\n" + doc_json["text"])
        doc.user_data["date"] = doc_json["date"]
        doc.user_data["url"] = doc_json["url"]
        doc.user_data["newspaper"] = doc_json["newspaper"]
        doc.user_data["categoria"] = doc_json["categoria"]
        doc.ents = []
        return doc
    else:
        if not doc_json["annotation_approver"] and not doc_json["labels"]:
            raise ValueError("Instance is not annotated!")
        else:
            doc = nlp(doc_json["text"])
            spans = list()
            # print(doc_json)
            for label in doc_json["labels"]:
                # print(doc_json["text"])
                if include_other or label[2] != "OTHER":
                    if doc_json["annotation_approver"] != "lazaro":
                        start_char = label[0]
                        end_char = label[1]
                        tag = label[2]
                        token_start = get_starting_token(start_char, doc)
                        token_end = get_ending_token(end_char, doc)
                    else:
                        token_start = label[0]
                        token_end = label[1]
                        tag = label[2]
                    if token_start is None or token_end is None:
                        raise ValueError("Token alignment impossible!")
                    spans.append(Span(doc, token_start, token_end, tag))
            doc.ents = spans
            return doc
def __call__(self, doc): """Apply the pipeline component on a Doc object and modify it if matches are found. Return the Doc, so it can be processed by the next component in the pipeline, if available. """ ruler = self.ruler(doc) # execute the ruler spans = [] # keep the spans for later so we can merge them afterwards for _, start, end in self.ruler.matcher(doc): # Generate Span representing the entity & set label entity = Span(doc, start, end, label=self.label) spans.append(entity) # Set custom attribute on each token of the entity # Can be extended with other data returned by the API, like # lei_code, country for token in entity: token._.set("is_lei", True) with doc.retokenize() as retokenizer: # Iterate over all spans en merge spans of multiple tokens into one single token for span in spans: retokenizer.merge(span) return doc
def build_docx(lines: List[str], tti: TaxonomyTokenIdentify):
    # nlp = tti.nlp
    matcher = tti.get_phrase_matcher()
    docx = tti.spacify_text('\n'.join(lines), False)  # No fuzzy matching
    matches = matcher(docx)
    match_spans = []
    for match_id, start, end in matches:
        try:
            # rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
            # span = docx[start:end]  # get the matched slice of the doc
            # print(rule_id, span.text)
            # create a new Span for each match and use the match_id (ANIMAL) as the label
            span = Span(docx, start, end, label=match_id)
            match_spans.append(span)
        except Exception as ee:
            print(match_id, docx[start:end], ee)
    docx.ents = set(list(docx.ents)) | set(filter_spans(match_spans))
    return docx
def test_issue4313():
    """This should not crash or exit with some strange error code"""
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    ner.add_label("SOME_LABEL")
    nlp.initialize()

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
    assert len(ner.labels) == 2
    assert "MY_ORG" in ner.labels
def __call__(self, doc: Doc) -> Doc: """ Annotate the document with noun phrase spans """ spans = [] doc_vecs = [] doc_chars = [] doc_lens = [] if len(doc) < 1: return doc for sentence in doc.sents: features = self._feature_extractor([t.text for t in sentence]) if isinstance(features, tuple): doc_vec = features[0] doc_chars.append(features[1]) else: doc_vec = features doc_vecs.append(doc_vec) doc_lens.append(len(doc_vec)) doc_vectors = pad_sentences(np.asarray(doc_vecs)) inputs = doc_vectors if self.char_vocab: max_len = doc_vectors.shape[1] padded_chars = np.zeros( (len(doc_chars), max_len, self.model.max_word_len)) for idx, d in enumerate(doc_chars): d = d[:max_len] padded_chars[idx, -d.shape[0]:] = d inputs = [inputs, padded_chars] np_indexes = self._infer_chunks(inputs, doc_lens) for s, e in np_indexes: np_span = Span(doc, s, e) spans.append(np_span) spans = _NPPostprocessor.process(spans) set_noun_phrases(doc, spans) return doc
def test_matcher_sets_return_correct_tokens(en_vocab):
    matcher = Matcher(en_vocab)
    patterns = [
        [{'LOWER': {'IN': ["zero"]}}],
        [{'LOWER': {'IN': ["one"]}}],
        [{'LOWER': {'IN': ["two"]}}],
    ]
    matcher.add('TEST', None, *patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ['zero', 'one', 'two']
def test_get_spans(nlp_small):
    doc = nlp_small("This is just a small test for checking that the method works correctly")
    doc.spans["source1"] = [
        Span(doc, 0, 2, label="LABEL1"),
        Span(doc, 4, 5, label="LABEL2")
    ]
    doc.spans["source2"] = [
        Span(doc, 0, 1, label="LABEL3"),
        Span(doc, 2, 6, label="LABEL2")
    ]
    doc.spans["source4"] = [Span(doc, 0, 2, label="LABEL2")]
    doc.spans["source3"] = [
        Span(doc, 7, 9, label="LABEL2"),
        Span(doc, 1, 4, label="LABEL1")
    ]
    assert set(
        (span.start, span.end)
        for span in utils.get_spans(doc, ["source1", "source2"])) == {(0, 2), (2, 6)}
    assert set(
        (span.start, span.end)
        for span in utils.get_spans(doc, ["source1", "source3"])) == {(1, 4), (4, 5), (7, 9)}
    assert {
        (span.start, span.end): span.label_
        for span in utils.get_spans(doc, ["source1", "source4"])
    } == {(0, 2): "LABEL2", (4, 5): "LABEL2"}
    assert set(
        (span.start, span.end)
        for span in utils.get_spans(doc, ["source2", "source3"])) == {(0, 1), (2, 6), (7, 9)}
def test_span_kb_id_readonly(doc):
    span = Span(doc, 0, 1)
    with pytest.raises(NotImplementedError):
        span.kb_id_ = "Q342"
def test_span_label_readonly(doc):
    span = Span(doc, 0, 1)
    with pytest.raises(NotImplementedError):
        span.label_ = "hello"