def __init__(self, nlp, label='GPE'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    # Make request once on initialisation and store the data
    r = requests.get('https://restcountries.eu/rest/v2/all')
    r.raise_for_status()  # make sure requests raises an error if it fails
    countries = r.json()

    # Convert API response to dict keyed by country name for easy lookup
    # This could also be extended using the alternative and foreign language
    # names provided by the API
    self.countries = {c['name']: c for c in countries}
    self.label = nlp.vocab.strings[label]  # get entity label ID

    # Set up the PhraseMatcher with Doc patterns for each country name
    patterns = [nlp(c) for c in self.countries.keys()]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('COUNTRIES', None, *patterns)

    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    # Note: recent spaCy versions require an explicit default (or getter/
    # method/setter), so we set default=None rather than omitting it.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital', default=None)
    Token.set_extension('country_latlng', default=None)
    Token.set_extension('country_flag', default=None)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_country == True.
    Doc.set_extension('has_country', getter=self.has_country)
    Span.set_extension('has_country', getter=self.has_country)
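# A minimal sketch of the companion __call__ this __init__ implies, following
# the spaCy v2 custom-component pattern the snippet is based on; the exact
# body is an assumption, not taken from the source.
def __call__(self, doc):
    """Apply the matcher and set the registered attributes (sketch)."""
    matches = self.matcher(doc)
    spans = []  # collect matched spans so they can be merged into doc.ents
    for _, start, end in matches:
        entity = Span(doc, start, end, label=self.label)
        spans.append(entity)
        # Write the per-token attributes from the cached API response
        for token in entity:
            token._.set('is_country', True)
            token._.set('country_capital', self.countries[entity.text]['capital'])
            token._.set('country_latlng', self.countries[entity.text]['latlng'])
            token._.set('country_flag', self.countries[entity.text]['flag'])
    doc.ents = list(doc.ents) + spans
    return doc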
def __init__(self, nlp, companies=tuple(), label='ORG'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    self.label = nlp.vocab.strings[label]  # get entity label ID

    # Set up the PhraseMatcher – it can now take Doc objects as patterns,
    # so even if the list of companies is long, it's very efficient
    patterns = [nlp(org) for org in companies]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('TECH_ORGS', None, *patterns)

    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    Token.set_extension('is_tech_org', default=False)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_tech_org == True.
    Doc.set_extension('has_tech_org', getter=self.has_tech_org)
    Span.set_extension('has_tech_org', getter=self.has_tech_org)
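# The has_tech_org getter referenced above is not shown in the snippet; a
# minimal sketch consistent with the registration (an assumption on my part).
def has_tech_org(self, tokens):
    """Getter for Doc/Span: True if any contained token was matched (sketch)."""
    return any(t._.get('is_tech_org') for t in tokens)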
def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
    """Instantiate SpacyQuickUMLS object

    This creates a QuickUMLS spaCy component which can be used in modular
    pipelines. This module adds entity Spans to the document where the
    entity label is the UMLS CUI and the Span's "underscore" object is
    extended to contain "similarity" and "semtypes" for matched concepts.

    Args:
        nlp: Existing spaCy pipeline. This is needed to update the
            vocabulary with UMLS CUI values.
        quickumls_fp (str): Path to QuickUMLS data.
        best_match (bool, optional): Whether to return only the top match
            or all overlapping candidates. Defaults to True.
        ignore_syntax (bool, optional): Whether to use the heuristics
            introduced in the paper (Soldaini and Goharian, 2016).
            TODO: clarify. Defaults to False.
        **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
    """
    self.quickumls = QuickUMLS(
        quickumls_fp,
        # By default, the QuickUMLS object creates its own internal spaCy
        # pipeline, but this is not needed when we're using it as a
        # component in a pipeline
        spacy_component=True,
        **kwargs)

    # save this off so that we can get vocab values of labels later
    self.nlp = nlp

    # keep these for matching
    self.best_match = best_match
    self.ignore_syntax = ignore_syntax

    # let's extend this with some properties that we want
    Span.set_extension('similarity', default=-1.0)
    Span.set_extension('semtypes', default=-1.0)
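# A hedged usage sketch: the SpacyQuickUMLS class name matches the docstring
# above, but the spaCy v2-style add_pipe call and the data path are
# assumptions, not confirmed by the source.
import spacy

nlp = spacy.load('en_core_web_sm')
quickumls = SpacyQuickUMLS(nlp, '/path/to/quickumls_data')
nlp.add_pipe(quickumls)

doc = nlp('The patient was diagnosed with hypertension.')
for ent in doc.ents:
    # ent.label_ holds the UMLS CUI; the underscore attributes were
    # registered in __init__ above
    print(ent.text, ent.label_, ent._.similarity, ent._.semtypes)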
def __init__(self, nlp,
             attrs=('has_nfh', 'is_nfh', 'nfh', 'is_deter_nfh', 'nfh_head', 'is_implicit'),
             force_extension=True):
    """Initialise the pipeline component.

    nlp (Language): The shared nlp object. Used to initialise the matcher
        with the shared `Vocab`, and create `Doc` match patterns.
    RETURNS (callable): A spaCy pipeline component.
    """
    download_models()
    home = path.expanduser("~")
    with open(path.join(home, NFH_DIR, IDENTIFICATION_NFH), 'rb') as f:
        self.identification = pickle.load(f)
    self.feature_extractor = FeatureExtractor(3)

    archive_model = load_archive(path.join(home, NFH_DIR, RESOLUTION_NFH))
    self.resolution_predictor = Predictor.from_archive(
        archive_model, 'nfh_classification')

    self._has_nfh, self._is_nfh, self._nfh, self._is_deter_nfh, \
        self._nfh_head, self._is_implicit = attrs
    self._nfh_items = 'nfh_items'

    # Add attributes
    Doc.set_extension(self._has_nfh, getter=self.has_nfh, force=force_extension)
    Span.set_extension(self._has_nfh, getter=self.has_nfh, force=force_extension)

    Doc.set_extension(self._nfh, getter=self.iter_nfh, force=force_extension)
    Span.set_extension(self._nfh, getter=self.iter_nfh, force=force_extension)

    Span.set_extension(self._is_nfh, default=False, force=force_extension)
    Token.set_extension(self._is_nfh, default=False, force=force_extension)

    Token.set_extension(self._is_deter_nfh, default=False, force=force_extension)
    Token.set_extension(self._nfh_head, default=None, force=force_extension)
    Token.set_extension(self._is_implicit, default=False, force=force_extension)

    Doc.set_extension(self._nfh_items, default=[], force=force_extension)
def install_extensions():
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)
    for k in [
        K.bunsetsu.element,
        K.tag.element,
        K.bunsetsu.list_,
        K.morph.list_,
        K.tag.list_,
    ]:
        Span.set_extension(k, default=None, force=True)
    for k in [BUNSETSU, TAG]:
        Span.set_extension(getattr(KNP_USER_KEYS, k).spans, getter=get_knp_span(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).parent, getter=get_knp_parent(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).children, getter=get_knp_children(k))
def _install_extensions():
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)
    for k in ["bunsetsu", "tag"]:
        Token.set_extension(getattr(K.morph, k), getter=token_to_knp_span(k))
    for k in ["bunsetsu", "morph", "tag"]:
        for feature in ["element", "list_"]:
            key = getattr(getattr(K, k), feature)
            Span.set_extension(key, default=None, force=True)
    for k in ["bunsetsu", "morph", "tag"]:
        for feature in ["spans", "list_"]:
            key = getattr(getattr(K, k), feature)
            Doc.set_extension(key, getter=get_all_knp_features_from_sents(k, feature))
    for k in [BUNSETSU, TAG]:
        Span.set_extension(getattr(KNP_USER_KEYS, k).spans, getter=get_knp_span(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).parent, getter=get_knp_parent(k))
        Span.set_extension(
            getattr(KNP_USER_KEYS, k).children, getter=get_knp_children(k)
        )
def __init__(self, spacy_instance, stop_words=None):
    self.nlp = spacy_instance
    self.stop_words = spacy_instance.Defaults.stop_words if stop_words is None else stop_words
    self.whitelist_words = {'pajamas'}
    self.whitelist_grammar = {'UPPERCASE_SENTENCE_START'}
    self.spell = SpellChecker()

    Span.set_extension('has_grammar_errors', default=False, force=True)
    Span.set_extension('grammar_recommendation', default=[], force=True)
    Span.set_extension('has_spelling_errors', default=False, force=True)
    Token.set_extension('correct_spelling_candidates', default=[], force=True)
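# A minimal sketch of how a __call__ might fill the spelling extensions
# registered above. The body is an assumption (only SpellChecker.unknown and
# SpellChecker.candidates are the real pyspellchecker API), and it assumes
# the pipeline sets sentence boundaries so doc.sents works.
def __call__(self, doc):
    for sent in doc.sents:
        words = [t.text.lower() for t in sent
                 if t.is_alpha and t.text.lower() not in self.stop_words]
        misspelled = self.spell.unknown(words) - self.whitelist_words
        if misspelled:
            sent._.has_spelling_errors = True
        for token in sent:
            if token.text.lower() in misspelled:
                candidates = self.spell.candidates(token.text.lower())
                # candidates may be None in recent pyspellchecker versions
                token._.correct_spelling_candidates = sorted(candidates or [])
    return doc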
def __init__(self, nlp, keywords, label, tokentag, doctag=None, spantag=None):
    nlp.vocab.strings.add(label)
    self.label = nlp.vocab.strings[label]
    self._label_str = label
    self._token_tag = tokentag
    self._doctag = doctag
    self._spantag = spantag
    self._keywordtag = "is_keyword"
    self._labeltag = "label_"

    # Set up the PhraseMatcher – it can take Doc objects as patterns,
    # so even if the list of keywords is long, it's very efficient
    patterns = [nlp(key) for key in keywords]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add(self._token_tag, None, *patterns)

    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    Token.set_extension(self._token_tag, default=False)
    if not Token.has_extension(self._keywordtag):
        Token.set_extension(self._keywordtag, default=False)
        Token.set_extension(self._labeltag, default=None)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens has the token tag set to True.
    Doc.set_extension(self._doctag, getter=lambda tokens: any(
        t._.get(self._token_tag) for t in tokens))
    Span.set_extension(self._spantag, getter=lambda tokens: any(
        t._.get(self._token_tag) for t in tokens))

    if not Span.has_extension("dep_"):
        Span.set_extension("dep_", default="")
        Span.set_extension("head_", default=None)
from spacy.lang.ja import Japanese
from spacy.tokens import Span

nlp = Japanese()


# Define the method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register to_html as the Span method extension "to_html"
Span.set_extension("to_html", method=to_html)

# Process the text and call the span's to_html method with the tag "strong"
doc = nlp("おはようございます、 これは文章です。")
span = doc[0:3]
print(span._.to_html("strong"))
# Most of the code in this file has been written by Christian Overdijk.
import spacy
from spacy.matcher import Matcher
# from import_data_json import import_data_json
from spacy.tokens import Span


def entity_detector(doc):
    entity_list = []
    for d in doc:
        if d.text[:3] == 'LOC':
            entity_list.append(d)
    return entity_list


Span.set_extension("entities", method=entity_detector, force=True)


# Takes data as pandas DataFrame (1 column)
# Needs trained nlp model (spaCy)
# word_dist = number of tokens before and after found entities that are included in the result
# max_token_dist = maximum number of tokens between entities in one sentence before descriptions
#   are considered different and split
# Returns: list of results split per input, and then per sentence
def get_location_descriptions_json(data, nlpmodel, word_dist=7, max_token_dist=14):
    results = []
    for article in data:
        article_results = []
        doc = nlpmodel(article)
        for sent in doc.sents:
            sentence_results = []
        position -= 1
    while position >= 0:
        start = constituent_data.starts[position]
        end = constituent_data.ends[position]
        if start <= span.start and span.end <= end:
            return doc[start:end]
        if end < span.sent.start:
            break
        position -= 1
    return None

#%%
Span.set_extension('labels', getter=get_labels)
Span.set_extension('parse_string', getter=parse_string)
Span.set_extension('constituents', getter=get_subconstituents)
Span.set_extension('parent', getter=get_parent_span)
Span.set_extension('children', getter=get_child_spans)

Token.set_extension(
    'labels',
    getter=lambda token: get_labels(token.doc[token.i:token.i + 1]))
Token.set_extension(
    'parse_string',
    getter=lambda token: parse_string(token.doc[token.i:token.i + 1]))
Token.set_extension(
    'parent',
    getter=lambda token: get_parent_span(token.doc[token.i:token.i + 1]))
import spacy
from spacy.tokens import Span

nlp = spacy.load("de_core_news_sm")


def get_wikipedia_url(span):
    # Generate a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PER", "ORG", "LOC"):
        entity_text = span.text.replace(" ", "_")
        return "https://de.wikipedia.org/w/index.php?search=" + entity_text


# Register the Span extension "wikipedia_url" with the getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In seiner mehr als fünfzigjährigen Karriere und von seinen ersten Aufnahmen "
    "bis hin zu seinem letzten Album, gehörte David Bowie zu den Vorreitern der "
    "Gegenwartskultur.")
for ent in doc.ents:
    # Print the entity's text and Wikipedia URL
    print(ent.text, ent._.wikipedia_url)
    # if token_span_doc.vocab.has_vector(token_span_doc.text):
    #     return token_span_doc.vocab.get_vector(token_span_doc.text)
    doc = token_span_doc.doc
    use_model_url = doc._.use_model_url
    preprocessor_url = doc._.preprocessor_url
    # if not use_model_url:
    model = UniversalSentenceEncoder.get_model(use_model_url, preprocessor_url)
    vector = model.embed_one(token_span_doc)
    return vector


# install/register the extensions
Doc.set_extension('use_model_url', default=None, force=True)
Doc.set_extension('preprocessor_url', default=None, force=True)
Token.set_extension('universal_sentence_encoding', getter=get_vector, force=True)
Span.set_extension('universal_sentence_encoding', getter=get_vector, force=True)
Doc.set_extension('universal_sentence_encoding', getter=get_vector, force=True)


# the pipeline stage factory
@Language.factory('universal_sentence_encoder', default_config={
    'use_model_url': None,
    'preprocessor_url': None,
    'model_name': None,
    'enable_cache': True,
    'debug': False
})
def use_model_factory(nlp, name, use_model_url, preprocessor_url, model_name,
                      enable_cache, debug):
    preprocessor_url_config = None
    if debug:
        print('use_model_factory:', nlp, 'use_model_url', use_model_url,
              'model_name', model_name)
    if use_model_url:
from spacy.tokens import Span


def get_anatomical_location(span):
    for modifier in span._.modifiers:
        if modifier.category == "ANATOMY":
            return modifier.span
    return None


Span.set_extension("anatomical_location", getter=get_anatomical_location)
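# A hedged usage sketch: span._.modifiers is assumed to be populated upstream
# by a clinical context component (e.g. medspacy's ConText), whose modifier
# objects expose .category and .span; none of this is confirmed by the source.
for ent in doc.ents:
    location = ent._.anatomical_location
    if location is not None:
        print(ent.text, '->', location.text)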
        nat.append(ent._.nationality)
    return nat


def extract_foreign(doc):
    is_foreign = []
    for ent in doc.ents:
        if ent._.travel_status:
            is_foreign.append({
                "place": acronym_to_country(ent.text),
                "is_foreign": ent.text not in l,
            })
    return is_foreign


Span.set_extension("travel_status", getter=get_travel_status, force=True)
Span.set_extension("nationality", getter=get_nat, force=True)
Token.set_extension("relationship", getter=get_rel, force=True)

app = Flask(__name__)

default_result = {
    "nationality": [],
    "travel": [],
    "relationship": [],
    "place_attributes": [],
}


@functools.lru_cache(30000)
def record_processor(sent):
def __init__(self, nlp, name="my_pipe"): self.name = name Span.set_extension("my_ext", getter=self._get_my_ext) Doc.set_extension("my_ext", default=None)
# Define the getter function
def get_has_number(doc):
    # Return True if any token in the doc returns True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension 'has_number' with the getter get_has_number
Doc.set_extension('has_number', getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print('has_number:', doc._.has_number)

# Part 2
from spacy.tokens import Span


# Define the method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return "<{tag}>{text}</{tag}>".format(tag=tag, text=span.text)


# Register the Span property extension 'to_html' with the method to_html
Span.set_extension('to_html', method=to_html)

# Process the text and call the to_html method on the span with the tag name 'strong'
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html('strong'))
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple

import spacy
from spacy.tokens import Doc, Span, Token
from spacy.util import filter_spans
from toolz import curry
from typing_extensions import Literal

from camphr.consts import JUMAN_LINES
from camphr.utils import get_juman_command

from .consts import KNP_USER_KEYS
from .noun_chunker import knp_noun_chunker

LOC2IOB = {"B": "B", "I": "I", "E": "I", "S": "B"}
Span.set_extension(JUMAN_LINES, default=None)
SKIP_TOKENS = {"@"}

TAG = "tag"
BUNSETSU = "bunsetsu"
MORPH = "morph"
L_KNP_OBJ = Literal["tag", "bunsetsu", "morph"]


def _take_juman_lines(n: int, juman_lines: List[str]) -> Tuple[List[str], List[str]]:
    lines = []
    count = 0
    for line in juman_lines:
        lines.append(line)
        head = line.split(" ")[0]
if doc[matches[0][1] + 1].text == "hoogte": # print("------------------------------------------------") # print(doc[matches[0][1]+1], " : ", doc[matches[0][1]+1].pos_) doc[matches[0][1] + 1].pos_ = "ADP" # print(doc[matches[0][1]+1], " : ", doc[matches[0][1]+1].pos_) def delete_match(matcher, doc, id, matches): if doc[matches[0][1]].text == "van" or doc[matches[0][1]].text == "met": # print("------------------------------------------------") # print(doc[matches[0][1]], " : ", doc[matches[0][1]].pos_) doc[matches[0][1]].pos_ = "X" # print(doc[matches[0][1]], " : ", doc[matches[0][1]].pos_) Span.set_extension("entities", default=[]) # Takes data as pandas DataFrame (1 column) # Needs trained nlp model (spaCy) # Word_distane = number of tokens before and after found entities that are included with result # Max_token_dist = maximum number op tokens between enities in one sentence before discriptions are considered different ans split # Returns: list of result split per input, and than per sentence def get_location_descriptions(data, nlpmodel, word_dist=4, max_token_dist=8, entity_filter=ENTITY_LIST): results = [] for article in data:
        for match_id, start, end in matcher(doc)
    ]
    return doc


def get_pi_url(span):
    """Get a URL for PI if the span has the DRUG label"""
    # Use an equality check: `span.label_ in 'DRUG'` would be a substring
    # test against the string 'DRUG', which is not what we want
    if span.label_ == 'DRUG':
        entity_text = span.text.replace(' ', '_')
        url = "https://www.ebs.tga.gov.au/ebs/picmi/picmirepository.nsf/PICMI?OpenForm&t=pi&q=" + entity_text
        web_list.append(url)
        return url


# Set the Span extension PI_url using the getter get_pi_url
Span.set_extension('PI_url', getter=get_pi_url, force=True)

# Add the component to the pipeline
nlp.add_pipe(drug_component)
print(nlp.pipe_names)

# Make the sentence lowercase, process the text and print the entity text,
# label and PI_url attributes
doc = nlp(sentence.lower())
print([(ent.text, ent.label_, ent._.PI_url) for ent in doc.ents])

for web in list(set(web_list)):
    # Note that I used 'set' here so that there are no repeats of urls
    # (e.g. when drugs are mentioned more than once in the text) in the 'web_list' list
    try:
        # Get the chrome driver location
        chrome_path = r"C:\Users\Andrew\path_to\chromedriver.exe"
        # add chrome_path to webdriver
def parse_conll_text_as_spacy(
    self,
    text: str,
    ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
    ner_map: Dict[str, str] = None,
) -> Doc:
    """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence
    sections must be separated by a new line (\n\n).

    Note that we do our best to retain as much information as possible but
    that not all CoNLL-U fields are supported in spaCy. We add a
    Token._.conll_misc_field extension to save the CoNLL-U MISC field, and a
    Token._.conll_deps_graphs_field extension to save the CoNLL-U DEPS field.
    The metadata (lines starting with #) is saved in Span._.conll_metadata
    of sentence Spans.

    This method has been adapted from the work by spaCy.
    See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

    Multi-word tokens and empty nodes are not supported.

    :param text: CoNLL-U formatted text
    :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
    :param ner_map: Map old NER tag names to new ones, '' maps to O
    :return: a spaCy Doc containing all the tokens and sentences from the
        CoNLL file, including the custom CoNLL extensions
    """
    if not Token.has_extension("conll_misc_field"):
        Token.set_extension("conll_misc_field", default="_")
    if not Token.has_extension("conll_deps_graphs_field"):
        Token.set_extension("conll_deps_graphs_field", default="_")
    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)

    docs = []
    for chunk in text.split("\n\n"):
        lines = [l for l in chunk.splitlines() if l and not l.startswith("#")]
        words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
        heads, deps, deps_graphs = [], [], []
        for i in range(len(lines)):
            line = lines[i]
            parts = line.split("\t")
            if any(not p for p in parts):
                raise ValueError(
                    "According to the CoNLL-U Format, fields cannot be empty. See"
                    " https://universaldependencies.org/format.html")
            id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts
            if any(" " in f for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                raise ValueError(
                    "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                    " spaces. See https://universaldependencies.org/format.html"
                )
            if "." in id_ or "-" in id_:
                raise NotImplementedError(
                    "Multi-word tokens and empty nodes are not supported in spacy_conll"
                )
            words.append(word)
            if "SpaceAfter=No" in misc:
                spaces.append(False)
            else:
                spaces.append(True)
            id_ = int(id_) - 1
            lemmas.append(lemma)
            poses.append(pos)
            tags.append(pos if tag == "_" else tag)
            morphs.append(morph if morph != "_" else "")
            heads.append((int(head) - 1) if head not in ("0", "_") else id_)
            deps.append("ROOT" if dep == "root" else dep)
            deps_graphs.append(deps_graph)
            miscs.append(misc)

        doc = Doc(
            self.nlp.vocab,
            words=words,
            spaces=spaces,
            tags=tags,
            pos=poses,
            morphs=morphs,
            lemmas=lemmas,
            heads=heads,
            deps=deps,
        )
        # Set custom Token extensions
        for i in range(len(doc)):
            doc[i]._.conll_misc_field = miscs[i]
            doc[i]._.conll_deps_graphs_field = deps_graphs[i]
        ents = get_entities(lines, ner_tag_pattern, ner_map)
        doc.ents = spans_from_biluo_tags(doc, ents)

        # The deprel relations ensure that this CoNLL chunk is one sentence.
        # Deprel therefore cannot be empty, or each word is considered a
        # separate sentence.
        if len(list(doc.sents)) != 1:
            raise ValueError(
                "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                " requirements. See https://universaldependencies.org/format.html. Particularly make"
                " sure that the DEPREL field is filled in.")

        # Save the metadata in a custom sentence Span attribute so that the
        # formatter can use it
        metadata = "\n".join([l for l in chunk.splitlines() if l.startswith("#")])
        # We really only expect one sentence
        for sent in doc.sents:
            sent._.conll_metadata = f"{metadata}\n" if metadata else ""

        docs.append(doc)

    # Add CoNLL custom extensions
    return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
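# A hedged usage sketch: spacy_conll exposes this method on its ConllParser
# class, but the exact construction below (init_parser arguments, model name)
# is an assumption, not taken from the source.
from spacy_conll import init_parser
from spacy_conll.parser import ConllParser

# init_parser attaches the conll_formatter pipe that the method relies on
parser = ConllParser(init_parser("en_core_web_sm", "spacy"))

conll_str = (
    "# text = Hello world\n"
    "1\tHello\thello\tINTJ\tUH\t_\t2\tdiscourse\t_\t_\n"
    "2\tworld\tworld\tNOUN\tNN\t_\t0\troot\t_\tSpaceAfter=No\n"
)
doc = parser.parse_conll_text_as_spacy(conll_str)
for token in doc:
    print(token.text, token.dep_, token._.conll_misc_field)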
# Imports added for completeness (requests, numpy, spacy, and the lexrank
# package's documented LexRank/STOPWORDS interface)
import numpy as np
import requests
import spacy
from lexrank import LexRank, STOPWORDS
from spacy.tokens import Doc, Span

urls = ['http://www.gutenberg.org/cache/epub/29444/pg29444.txt',
        'http://www.gutenberg.org/cache/epub/31428/pg31428.txt',
        'http://www.gutenberg.org/cache/epub/4908/pg4908.txt',
        'http://www.gutenberg.org/cache/epub/33504/pg33504.txt',
        'http://www.gutenberg.org/cache/epub/39017/pg39017.txt',
        'http://www.gutenberg.org/cache/epub/37157/pg37157.txt',
        'http://www.gutenberg.org/cache/epub/15207/pg15207.txt',
        'http://www.gutenberg.org/cache/epub/10773/pg10773.txt',
        'http://www.gutenberg.org/cache/epub/31624/pg31624.txt',
        'http://www.gutenberg.org/cache/epub/5192/pg5192.txt',
        'http://www.gutenberg.org/cache/epub/40030/pg40030.txt',
        'http://www.gutenberg.org/cache/epub/50992/pg50992.txt',
        'http://www.gutenberg.org/cache/epub/50880/pg50880.txt',
        'http://www.gutenberg.org/cache/epub/36525/pg36525.txt',
        'http://www.gutenberg.org/cache/epub/47167/pg47167.txt',
        'http://www.gutenberg.org/cache/epub/47464/pg47464.txt',
        'http://www.gutenberg.org/cache/epub/33397/pg33397.txt',
        'http://www.gutenberg.org/cache/epub/29782/pg29782.txt',
        'http://www.gutenberg.org/cache/epub/32857/pg32857.txt',
        'http://www.gutenberg.org/cache/epub/26262/pg26262.txt']

docs = [requests.get(u).text for u in urls]
lxr = LexRank(docs, stopwords=STOPWORDS['en'])

nlp = spacy.load('en_core_web_md')


# Despite the name, this computes the mean token log-probability of the span,
# not a true tf-idf score
def tfidf(span):
    return sum(tok.prob for tok in span) * 1.0 / len(span)


Span.set_extension("weight", getter=tfidf, force=True)


def ranktfidf(doc):
    return np.argsort(np.array([sent._.weight for sent in doc.sents]))[::-1]


Doc.set_extension("ranktfidf", getter=ranktfidf, force=True)
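# A short usage sketch of the ranking getter above, on a toy text rather than
# the Gutenberg corpus (illustrative assumption, not from the source):
# doc._.ranktfidf yields sentence indices sorted by descending weight.
doc = nlp("This is dull. Custom extensions make sentence ranking easy. Done.")
sents = list(doc.sents)
for idx in doc._.ranktfidf:
    print(sents[idx]._.weight, sents[idx].text)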
import spacy
import lemminflect
import logging
import typing

from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from lemminflect import getInflection

logging.basicConfig(level=logging.INFO)

# DO NOT SET MANUALLY
MOD_CONSERVATIVE = False

Doc.set_extension("clauses", default=[], force=True)
Span.set_extension("clauses", default=[], force=True)

dictionary = {
    "non_ext_copular": """die
    walk""".split(),
    "ext_copular": """act
    appear
    be
    become
    come
    come out
    end up
    get
    go
    grow
        if adu[idx] == 0:
            result.append(s)
    return result


def get_features(span):
    return span.doc._.Features[span._.index]


def get_mc(doc):
    for idx, val in enumerate(list(doc.sents)):
        if doc._.MC_List[idx] == 1:
            return val


Span.set_extension("Label", getter=get_sentence_label)
Span.set_extension("CLPR_Label", getter=get_sentence_label)
Span.set_extension("index", getter=get_index)
Span.set_extension("Feature", getter=get_features)
Span.set_extension("mc", default=0)
Token.set_extension("Label", getter=get_token_label)
Doc.set_extension("ADU_Sents", getter=get_ADU)
Doc.set_extension("Claim_Sents", getter=get_CL)
Doc.set_extension("Premise_Sents", getter=get_PR)
Doc.set_extension("MC_List", default=[])
Doc.set_extension("MajorClaim", getter=get_mc)
Doc.set_extension("sentences", getter=get_sentences)
Doc.set_extension("Labels", default=[0])
Doc.set_extension("CLPR_Labels", default=[0])
}

all_pronouns = set()
inv_pronoun_map = collections.defaultdict(list)
for k, pronoun_list in pronoun_lists.items():
    for v in pronoun_list:
        inv_pronoun_map[v].append(k)
        all_pronouns.add(v)

path = MODELS_DIR
USE_NOUN_CHUNKS = False

import neuralcoref
neuralcoref.add_to_pipe(nlp)

Span.set_extension("fused_type", default="")
Span.set_extension("is_pronoun", default=False)

models = []
if CANDIDATE_RECALL:
    models.append((None, None, collections.defaultdict(lambda: (0, 0))))
elif os.path.exists(path):
    for model_name in listdir(path):
        sess = tf.Session(graph=tf.Graph())
        tf.saved_model.loader.load(sess, ["serve"], os.path.join(path, model_name))
        # print([n.name for n in sess.graph.as_graph_def().node][:10])
        header_file = os.path.join(path, model_name, "header.txt")
        model_preds = []
        predicate_thresholds = {}
        if os.path.exists(header_file):
            with open(header_file, 'r') as file:
def __init__(self, cfg):
    # TODO fix force=True (isn't supposed to work that way)
    self.__name__ = 'linker'
    Span.set_extension("link", default=None, force=True)
    self.LABEL_URL_MAPPER = cfg['EntityLinker']
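# A minimal sketch (an assumption, not the project's actual code) of a
# companion __call__ that fills the "link" extension from the configured
# label-to-URL mapping.
def __call__(self, doc):
    for ent in doc.ents:
        # Look up a URL for the entity label; leave the default None otherwise
        if ent.label_ in self.LABEL_URL_MAPPER:
            ent._.link = self.LABEL_URL_MAPPER[ent.label_]
    return doc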
        for match_id, start, end in structurematcher(span.as_doc())
    ]


def flagtypematched(span):
    return [
        nlp.vocab.strings[match_id]
        for match_id, start, end in flagtypematcher(span.as_doc())
    ]


def isflagmatched(span):
    return len(isflagmatcher(span.as_doc())) > 0


Span.set_extension("structurematched", getter=structurematched, force=True)
Span.set_extension("flagtypematched", getter=flagtypematched, force=True)
Span.set_extension("isflagmatched", getter=isflagmatched, force=True)

### Specify regular expressions

# Disinformation, fake news, clickbait, unreliable sources
regex_flag = re.compile('|'.join([r'\b' + o for o in objects_all + attributes]))
fake_regex = re.compile('|'.join(
    [r'\b' + o for o in objects_full + attributes + ['conspir']]))

# Sarcasm and irony
sarcasm_regex = "\\\s\\b|\\/s\\b"
irony_regex = '|'.join([
    "[*'\"“”‘’`´˝˶]" + s
Token.set_extension('is_color', getter=get_is_color)

doc = nlp("The sky is blue")
print(doc[3]._.is_color, '-', doc[3].text)

# Property extensions (2)
from spacy.tokens import Span


# Define getter function
def get_has_color(span):
    colors = ['red', 'yellow', 'blue']
    return any(token.text in colors for token in span)


# Set extension on the Span with getter
Span.set_extension('has_color', getter=get_has_color)

print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

# Method extensions
from spacy.tokens import Doc


# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc


# Set extension on the Doc with method
Doc.set_extension('has_token', method=has_token)
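# The snippet registers the method extension but never calls it; a short
# usage example in the same tutorial style (method extensions take arguments
# on each call, unlike property extensions):
print(doc._.has_token('blue'), '- blue')
print(doc._.has_token('cloud'), '- cloud')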
import requests
from requests.adapters import HTTPAdapter
import os
from sqlitedict import SqliteDict
import hashlib
from spacy.tokens import Span
import json
import time
import diffbot_nlapi
import logging

from config import MODEL, NUMBER_URI_CANDIDATES, SOFT_COREF_CANDIDATES

# el_candidate has types, uri, score
Span.set_extension("el_candidates", default=[])
Span.set_extension("uri_candidates", default=[])

db = SqliteDict(os.path.join('tmp', 'el.db'), autocommit=True)
configuration = diffbot_nlapi.Configuration()
api_instance = diffbot_nlapi.NaturalLanguageApi(
    diffbot_nlapi.ApiClient(configuration))


def _get_uri_candidates_from_mention_with_score(mention, score):
    return [{
        'types': elc["types"],
        'uri': elc["uri"],
        'score': (2 * score) + elc["score"],
        'coref_score': score,
        'el_score': elc["score"]
import warnings
from typing import Any, Callable, Dict, List, Union

from functional import seq
from jsonschema import validate
from spacy.tokens import Doc, Span

from replacy.db import get_match_dict_schema

# set known extensions:
known_string_extensions = ["description", "match_name", "category", "comment"]
known_list_extensions = ["suggestions"]
for ext in known_list_extensions:
    Span.set_extension(ext, default=[], force=True)
for ext in known_string_extensions:
    Span.set_extension(ext, default="", force=True)

expected_properties = (["patterns", "match_hook", "test"]
                       + known_list_extensions + known_string_extensions)


# set custom extensions for any unexpected keys found in the match_dict
def get_novel_prop_defaults(match_dict):
    """
    Also mutates the global Span to add any needed extensions
    """
    novel_properties = (seq(match_dict.values()).flat_map(
        lambda x: x.keys()).distinct().difference(expected_properties))
    novel_prop_defaults: Dict[str, Any] = {}
    for x in match_dict.values():
        for k, v in x.items():
with open("exercises/de/capitals.json", encoding="utf8") as f: CAPITALS = json.loads(f.read()) nlp = spacy.blank("de") matcher = PhraseMatcher(nlp.vocab) matcher.add("COUNTRY", list(nlp.pipe(COUNTRIES))) @Language.component("countries_component") def countries_component_function(doc): # Erstelle eine Entitäts-Span mit dem Label "LOC" für alle Resultate matches = matcher(doc) doc.ents = [Span(doc, start, end, label="LOC") for match_id, start, end in matches] return doc # Füge die Komponente zur Pipeline hinzu nlp.add_pipe("countries_component") print(nlp.pipe_names) # Getter-Funktion, die den Text der Span im Lexikon der Hauptstädte nachschlägt get_capital = lambda span: CAPITALS.get(span.text) # Registriere die Span-Erweiterung "capital" mit Getter-Funktion get_capital Span.set_extension("capital", getter=get_capital) # Verarbeite den Text und drucke den Text, das Label und das Attribut capital für jede Entität doc = nlp("Tschechien könnte der Slowakei dabei helfen, ihren Luftraum zu schützen") print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
def __init__(self, model, vocab, sentence_length, get_features):
    self._model = model
    self._vocab = vocab
    self._get_features = get_features
    self.sentence_length = sentence_length
    Span.set_extension(KerasPipe.extension_name, default=0.0)
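# A hedged sketch of the companion __call__: the _get_features signature and
# the Keras predict call are assumptions, not taken from the source.
def __call__(self, doc):
    for sent in doc.sents:
        # Featurise the sentence to the fixed length the model expects
        features = self._get_features(sent, self._vocab, self.sentence_length)
        score = float(self._model.predict(features)[0])
        # Write the prediction to the extension registered in __init__
        sent._.set(KerasPipe.extension_name, score)
    return doc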