class TechCompanyRecognizer(object):
    """spaCy v2.0 pipeline component that tags known company names.

    Single- or multi-word company names are found with a PhraseMatcher,
    labelled as entities (``ORG`` by default) and merged into one token each.
    The component also exposes ``._.is_tech_org`` on tokens and
    ``._.has_tech_org`` on Doc and Span objects.
    """

    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Build the matcher and register the custom extensions.

        nlp: shared pipeline object; supplies the vocab, the label ID and
            the Doc objects used as phrase patterns.
        companies: iterable of company name strings.
        label: entity label assigned to every match.
        """
        # Entities are labelled by ID, so resolve the string once up front.
        self.label = nlp.vocab.strings[label]

        # The PhraseMatcher takes Doc objects as patterns.
        company_docs = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *company_docs)

        # Token flag: plain default, overwritten per token in __call__.
        Token.set_extension('is_tech_org', default=False)
        # Doc/Span flag: computed on access from the token flags.
        for container in (Doc, Span):
            container.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Annotate ``doc`` with company entities and return it."""
        found = []  # merged only after all entities are set
        for _, start, end in self.matcher(doc):
            entity = Span(doc, start, end, label=self.label)
            found.append(entity)
            for token in entity:
                token._.set('is_tech_org', True)
            # Extend the existing entities instead of replacing them.
            doc.ents = list(doc.ents) + [entity]
        # Merging changes token indices, so it has to happen last.
        for entity in found:
            entity.merge()
        return doc

    def has_tech_org(self, tokens):
        """Getter for Doc/Span: True if any contained token is a tech org."""
        return any(t._.get('is_tech_org') for t in tokens)
def test_issue3248_1():
    """len(PhraseMatcher) counts rules, not the patterns inside them."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = (("TEST1", ("a", "b", "c")), ("TEST2", ("d",)))
    for key, texts in rules:
        matcher.add(key, None, *[nlp(text) for text in texts])
    assert len(matcher) == 2
def test_issue3248_2():
    """A PhraseMatcher survives a pickle round trip with its rules intact."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = (("TEST1", ("a", "b", "c")), ("TEST2", ("d",)))
    for key, texts in rules:
        matcher.add(key, None, *[nlp(text) for text in texts])
    restored = pickle.loads(pickle.dumps(matcher))
    assert len(restored) == len(matcher)
def get_matches(tokenizer, phrases, texts, max_length=6):
    """Yield ``(match_id, matched_text)`` for every phrase found in ``texts``.

    tokenizer: callable turning a string into a Doc; its vocab backs the matcher.
    phrases: Doc patterns registered under the single rule ``'Phrase'``.
    texts: iterable of strings to search.
    max_length: forwarded to the PhraseMatcher constructor.
    """
    phrase_matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
    phrase_matcher.add('Phrase', None, *phrases)
    for doc in map(tokenizer, texts):
        # Look every token text up in the vocab before matching; the result
        # itself is discarded.
        for token in doc:
            doc.vocab[token.text]
        for ent_id, start, end in phrase_matcher(doc):
            yield (ent_id, doc[start:end].text)
def test_issue3331(en_vocab):
    """Identical patterns under different rules produce one match per rule."""
    matcher = PhraseMatcher(en_vocab)
    for key in ("A", "B"):
        matcher.add(key, None, Doc(en_vocab, words=["Barack", "Obama"]))
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    first, second = matches[0], matches[1]
    match_ids = [en_vocab.strings[first[0]], en_vocab.strings[second[0]]]
    assert sorted(match_ids) == ["A", "B"]
class AdversativeConnectivesTagger:
    '''
    Pipeline component that finds all adversative connectives in a document.

    It needs to go after the 'Tagger' pipeline component.
    '''
    name = 'adversative connective tagger'

    def __init__(self, nlp, language: str='es') -> None:
        '''
        Build a case-insensitive phrase matcher for the language's connectives.

        Parameters:
        nlp: The Spacy model to use this tagger with.
        language: The language that this pipeline will be used in.

        Raises:
        ValueError: If the language is not in ACCEPTED_LANGUAGES.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        # Only Spanish has a connective list so far; any other accepted
        # language currently yields an empty matcher.
        self._connectives = [
            'pero', 'sino', 'no obstante', 'sino que', 'sin embargo',
            'pero sí', 'aunque', 'menos', 'solo', 'excepto', 'salvo',
            'más que', 'en cambio', 'ahora bien', 'más bien',
        ] if language == 'es' else []

        # One rule per connective, keyed by the connective text itself.
        for connective in self._connectives:
            self._matcher.add(connective, None, nlp(connective))

    def __call__(self, doc: Doc) -> Doc:
        '''
        Find the adversative connectives and store their positions on the doc.

        Parameters:
        doc(Doc): A Spacy document.

        Returns:
        Doc: The same document, with
        ``doc._.adversative_connectives_span_indices`` set to a list of
        ``{'start', 'end', 'label'}`` dicts for the spans kept by filter_spans.
        '''
        candidates = [doc[start:end] for _, start, end in self._matcher(doc)]
        # Save the adversative connectives found
        doc._.adversative_connectives_span_indices = [
            {'start': found.start, 'end': found.end, 'label': found.label}
            for found in filter_spans(candidates)
        ]
        return doc
def _match_no_category(
    self,
    list_of_tags: List[AnyStr],
    list_of_keywords: List[AnyStr],
) -> None:
    """Tokenize keywords for every language and instantiate a PhraseMatcher.

    For each language in ``self.tokenizer.spacy_nlp_dict`` the keywords are
    tokenized, the ``sentencizer`` pipe is removed from that language's
    pipeline, and a matcher holding all patterns under the single rule
    ``"PatternList"`` is stored in ``self._matcher_dict[language]``.

    Args:
        list_of_tags: Tags forwarded to ``self._tokenize_keywords``.
        list_of_keywords: Keywords forwarded to ``self._tokenize_keywords``.
    """
    for language in self.tokenizer.spacy_nlp_dict:
        patterns = self._tokenize_keywords(language, list_of_tags, list_of_keywords)
        nlp = self.tokenizer.spacy_nlp_dict[language]
        # remove_pipe raises ValueError for a missing component, which made a
        # second call on the same tokenizer fail; only remove when present.
        if nlp.has_pipe("sentencizer"):
            nlp.remove_pipe("sentencizer")
        matcher = PhraseMatcher(
            nlp.vocab,
            attr=get_phrase_matcher_attr(self.lemmatization),
        )
        matcher.add("PatternList", patterns)
        self._matcher_dict[language] = matcher
def matcherR():
    """Match location names from ``onlylocations.csv`` in a sample sentence.

    Loads the ``en`` model (adding an ``ner`` pipe when it has none),
    registers every non-empty line of the CSV file as a ``GEOLOC`` phrase
    pattern, runs the matcher over a fixed sentence and prints the raw
    ``(match_id, start, end)`` tuples.
    """
    nlp = spacy.load('en')
    if 'ner' not in nlp.pipe_names:
        nlp.add_pipe(nlp.create_pipe('ner'))

    label = 'GEOLOC'
    matcher = PhraseMatcher(nlp.vocab)
    # Use a context manager so the file is closed, and strip each line: the
    # trailing newline would otherwise become part of the pattern and keep
    # it from ever matching running text.
    with open('onlylocations.csv', 'r') as locations:
        for line in locations:
            location = line.strip()
            if location:
                matcher.add(label, None, nlp(location))

    one = nlp('Chennai has been affected severly beacuse of the tsunami')
    matches = matcher(one)
    print(matches)
def string_to_nlp(s):
    """Return ``s`` processed by the module-level ``nlp`` pipeline.

    A PhraseMatcher for Indian state and territory names is also built here.
    NOTE(review): that matcher is local and is neither applied nor returned,
    so it has no effect on the result - confirm whether it was meant to be
    used.
    """
    state_names = [
        'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
        'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
        'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra',
        'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
        'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
        'Uttar Pradesh', 'Uttarakhand', 'West Bengal', 'Andaman and Nicobar',
        'Chandigarh', 'Dadra Nagar ', 'Haveli ', 'Daman ', 'Diu', 'Delhi',
        'Jammu and Kashmir', 'Ladakh', 'Lakshadweep', 'Puducherry',
    ]
    state_matcher = PhraseMatcher(nlp.vocab)
    # Only run nlp.make_doc to speed things up
    state_matcher.add("Indian State", None, *map(nlp.make_doc, state_names))
    return nlp(s)
def lexical_dangerous(nlp_doc):
    """Return the matched text of every listed quantifier in ``nlp_doc``.

    ``nlp_doc`` is converted with ``str()`` and re-parsed by the module-level
    ``nlp``; matching is case-insensitive (``LOWER``). One string is returned
    per match, in matcher order.
    """
    quantifiers = (
        "all", "each", "every", "any", "few", "little", "many", "much",
        "several", "some", "a lot",
    )
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    matcher.add("lexicalplural", None, *[nlp(term) for term in quantifiers])
    parsed = nlp(str(nlp_doc))
    return [parsed[start:end].text for _, start, end in matcher(parsed)]
def create_phrase_matcher(phrases, nlp=None, label="TRIGGER"):
    """Build a case-insensitive PhraseMatcher for ``phrases``.

    phrases: iterable of strings to register as patterns.
    nlp: pipeline whose vocab and tokenizer are used; when omitted, a
        tokenizer-only model is loaded via ``load_spacy_model``.
    label: rule name under which all patterns are added.

    Returns the configured matcher.
    """
    # Identity check: None is a singleton, and ``==`` would defer to whatever
    # equality the pipeline object defines.
    if nlp is None:
        nlp = load_spacy_model(output_type='doc', tokenizer_only=True, verbose=False)

    from spacy.matcher import PhraseMatcher

    # Match on the LOWER attribute so hits are case-insensitive.
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    # Only the tokenizer is needed to turn phrases into pattern Docs.
    patterns = list(nlp.tokenizer.pipe(phrases))
    matcher.add(label, None, *patterns)
    return matcher
def is_continuance_word(nlp_doc):
    """Return the matched text of every continuance marker in ``nlp_doc``.

    ``nlp_doc`` is converted with ``str()`` and re-parsed by the module-level
    ``nlp``; matching is case-insensitive (``LOWER``). One string is returned
    per match, in matcher order.
    """
    markers = (
        "below:", "as follows:", "following:", "listed:", "in particular:",
        "support:", " and ", ":",
    )
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    matcher.add("continuance", None, *[nlp(marker) for marker in markers])
    parsed = nlp(str(nlp_doc))
    return [parsed[start:end].text for _, start, end in matcher(parsed)]
def setUp(self):
    """Create the fixture: a parsed text and its keyword matches.

    Sets ``self.doc`` to the parsed sample text and ``self.matches`` to the
    case-insensitive PhraseMatcher hits for the keyword list.
    """
    nlp = spacy.load("en_core_web_sm")

    # Text
    txt = ('Hello world, my name is Sparty. My research is about '
           'A. thaliana protein 5. Hello.')
    self.doc = nlp(txt)

    # Keywords only need tokenizing, so make_doc is enough for the patterns.
    keywords = ['hello world', 'Sparty', 'A. thaliana', 'protein 5']
    keyword_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    keyword_matcher.add("Keywords", [nlp.make_doc(kw) for kw in keywords])
    self.matches = keyword_matcher(self.doc)
def return_people(self):
    """Return the last ``PEOPLE`` span found in ``self.text``.

    The text is parsed with ``en_core_web_sm`` and searched for the known
    person names. Returns a Span labelled ``"PEOPLE"`` for the last match,
    or ``None`` when no name occurs.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(self.text)
    people = ["Doctor Kimble", "Postman", "Deli Man", "Plumber"]
    people_matcher = PhraseMatcher(nlp.vocab)
    people_matcher.add("PEOPLE", list(nlp.pipe(people)))

    # Start from None so a text without any match yields None instead of
    # failing on an unbound local at the return statement.
    people_span = None
    for _, start, end in people_matcher(doc):
        people_span = Span(doc, start, end, label="PEOPLE")
    return people_span
class EntityMatcher(object):
    """Pipeline component that sets ``doc.ents`` from a list of terms.

    All terms share one label; existing entities on the doc are replaced by
    the matcher's hits.
    """

    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        """Register every term (parsed by ``nlp``) under the rule ``label``."""
        term_docs = [nlp(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *term_docs)

    def __call__(self, doc):
        """Overwrite ``doc.ents`` with one Span per match and return the doc."""
        # The match ID is the rule key, i.e. the label passed to __init__.
        doc.ents = [
            Span(doc, start, end, label=match_id)
            for match_id, start, end in self.matcher(doc)
        ]
        return doc
def find_associated(word, docs):
    """Return the words associated with ``word`` across a corpus.

    For every sentence of every text in ``docs`` (indexed by iterating
    ``docs`` and looking up ``docs[key]``), the first match of ``word`` is
    located; the syntactic head of the token following the match start is
    collected. Each head text appears at most once, in first-seen order.
    """
    associated = []
    matcher = PhraseMatcher(nlp.vocab)
    # Register the parsed word as ONE pattern. Unpacking the Doc (as the
    # previous ``*patterns`` did) passes its individual tokens as arguments,
    # which is not a valid PhraseMatcher.add call.
    matcher.add('TerminologyList', [nlp(word)])

    for doc_id in docs:
        parsed = nlp(docs[doc_id])
        for sent in parsed.sents:
            sent = sent.as_doc()
            matches = matcher(sent)
            if not matches:
                continue
            neighbour = matches[0][1] + 1
            # The match may start on the sentence's last token, in which
            # case there is no following token to inspect.
            if neighbour >= len(sent):
                continue
            head_text = sent[neighbour].head.text
            if head_text not in associated:
                associated.append(head_text)
    return associated
def sentence_Incompletes(nlp_doc):
    """Return the matched text of every incompleteness marker in ``nlp_doc``.

    ``nlp_doc`` is converted with ``str()`` and re-parsed by the module-level
    ``nlp``; matching is case-insensitive (``LOWER``). One string is returned
    per match, in matcher order.
    """
    markers = (
        "TBD", "TBS", "TBE", "TBC", "TBR", "not defined", "not determined",
        "but not limited to", "as a minimum",
    )
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    matcher.add("Incompletes", None, *[nlp(marker) for marker in markers])
    parsed = nlp(str(nlp_doc))
    return [parsed[start:end].text for _, start, end in matcher(parsed)]
def return_rooms(self):
    """Return the last ``ROOM`` span found in ``self.text``.

    The text is parsed with ``en_core_web_sm`` and searched for the known
    room names. Returns a Span labelled ``"ROOM"`` for the last match, or
    ``None`` when no room is mentioned.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(self.text)
    rooms = ["kitchen", "bedroom", "bathroom", "hallway", "living room"]
    room_matcher = PhraseMatcher(nlp.vocab)
    room_matcher.add("ROOM", list(nlp.pipe(rooms)))

    # Start from None so a text without any match yields None instead of
    # failing on an unbound local at the return statement.
    room_span = None
    for _, start, end in room_matcher(doc):
        # Create a Span with the label "ROOM"
        room_span = Span(doc, start, end, label="ROOM")
    return room_span
def main():
    """Print every exact phrase match for two rules in a sample sentence."""
    nlp = spacy.load('en_core_web_sm')
    matcher = PhraseMatcher(nlp.vocab)
    # One rule per category, each holding a single parsed pattern.
    for key, phrase in (('DOG', 'Golden Retriever'), ('FOWL', 'Chicken')):
        matcher.add(key, None, nlp(phrase))

    doc = nlp(
        "I have a Golden Chicken and a Golden Retrievable Fish and a Golden Retriever"
    )
    for _, start, end in matcher(doc):
        # The match tuple only carries token offsets; slice to get the text.
        print('Matched span:', doc[start:end].text)
def return_people():
    """Print the text of every known person name found in the global ``doc``.

    Relies on the module-level ``nlp`` pipeline and ``doc`` document.
    """
    names = ["Doctor Kimble", "Postman", "Deli Man", "Plumber"]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("PEOPLE", None, *list(nlp.pipe(names)))
    for _, start, end in matcher(doc):
        # Create a Span with the label "PEOPLE" and print its text
        person = Span(doc, start, end, label="PEOPLE")
        print(person.text)
def test_span_v_doc_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
    # Every word is its own list item. The previous list was missing two
    # commas, so adjacent literals were silently concatenated into the
    # tokens ",and" and "everywhere.".
    # fmt: off
    words = [
        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
        "Spans", "and", "Docs", "in", "my", "matchers", ",",
        "and", "Spans", "and", "Docs", "everywhere", ".",
    ]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[9:15]  # second clause
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches_doc = matcher(doc)
    matches_span = matcher(span)
    assert len(matches_doc) == 3
    assert len(matches_span) == 1
def return_objects():
    """Print the text of every known object phrase found in the global ``doc``.

    Relies on the module-level ``nlp`` pipeline and ``doc`` document.
    """
    phrases = ["can of coke", "coke can", "wallet", "purse", "bottle of water"]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("OBJECTS", None, *list(nlp.pipe(phrases)))
    for _, start, end in matcher(doc):
        # Create a Span with the label "OBJECTS" and print its text
        found = Span(doc, start, end, label="OBJECTS")
        print(found.text)
def dfprep(json_in, save_df, inputfile):
    """Build a keyword-tagging DataFrame from article JSON and pickle it.

    Articles lacking an abstract, a title or keywords are dropped. For each
    remaining article the title and abstract are joined into ``SRC`` and
    every token is tagged ``B``/``I``/``O`` according to case-insensitive
    matches of the article's own keywords.

    Args:
        json_in: Path or JSON source for ``pd.read_json`` (``orient='index'``).
        save_df: Destination path for the pickled DataFrame.
        inputfile: When equal to 1, ``json_in`` and ``save_df`` are read from
            a dict literal stored in ``input.txt`` instead.

    Columns written: ``SRC``, ``TRG`` (space-joined tags), ``keywords``,
    ``Extracted`` (number of distinct matched phrases) and ``abskey``
    (keywords that were not found in the text).
    """
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        json_in = para['json_in']
        save_df = para['save_df']

    with mlflow.start_run():
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
        artpd = pd.read_json(json_in, orient='index', convert_dates=False,
                             convert_axes=False)
        artpda = artpd[artpd.abstract.notnull()].copy()
        # Build the title mask from the frame being filtered; a mask taken
        # from the unfiltered frame has a different index and has to be
        # re-aligned by pandas.
        artpda = artpda[artpda.title.notnull()]
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()

        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords

        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            # The rule is rebuilt per article from that article's keywords
            # and removed again at the end of the iteration.
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)

            doc = svoc(t0.SRC)
            t1 = ['O'] * len(doc)
            matched = []  # distinct lower-cased matched phrases, in order
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = ['I'] * (end - start - 1)
                phrase = str(doc[start:end]).lower()
                if phrase not in matched:
                    matched.append(phrase)

            abskw = [x for x in t0.keywords if x.lower() not in matched]

            dataf.loc[pmid, 'TRG'] = ' '.join(t1)
            dataf.loc[pmid, 'Extracted'] = len(matched)
            # NOTE(review): assigns a list to a single cell through .loc;
            # verify this stores the list as one value on the pandas version
            # in use.
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")

        dataf.to_pickle(save_df)
def search_for_keyword(keyword, doc_obj, nlp):
    """Print every sentence of ``doc_obj`` containing a word of ``keyword``.

    keyword: string split on single spaces; each piece becomes one pattern.
    doc_obj: parsed document to search (sentence boundaries are required,
        since the enclosing sentence of each match is reported).
    nlp: pipeline supplying the vocab and tokenizer.

    Prints the number of matches followed by the sentence text of each one;
    returns nothing.
    """
    phrase_matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(word) for word in keyword.split(" ")]
    phrase_matcher.add("TerminologyList", patterns)

    # One entry per match, so a sentence with several hits is listed
    # several times.
    matched_text = [
        doc_obj[start:end].sent.text
        for _, start, end in phrase_matcher(doc_obj)
    ]

    print("Found ", len(matched_text), " matches")
    for txt in matched_text:
        print("Found Match ........")
        print(txt)
        print("\n")
class Singletons:
    """Process-wide holder for the signal cache, spaCy model and matcher.

    Obtain the shared object through ``get_instance()``; constructing the
    class a second time raises.
    """

    __instance = None
    cached_signals = phrase_matcher = nlp = None

    @staticmethod
    def get_instance():
        """Return the shared instance, creating it on first use."""
        if Singletons.__instance is None:
            logger.info("Calling Singletone private constructor")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        """Load the model and build the question-word matcher (once only)."""
        if Singletons.__instance is not None:
            raise Exception("This class is a singleton!")

        logger.info("Creating empty cache for signals")
        self.cached_signals = {}

        logger.info("Making the question PhraseMatcher")
        self.nlp = spacy.load("en_core_web_sm")
        self.phrase_matcher = PhraseMatcher(self.nlp.vocab, attr='LOWER')
        question_terms = [
            "who", "whom", "whose", "what", "when", "where", "why", "which",
            "how",
        ]
        self.phrase_matcher.add(
            "question", None, *[self.nlp(term) for term in question_terms])

        # Publish the instance only after it is fully initialised.
        Singletons.__instance = self

    def get_cached_signals(self):
        """
        Return the cached signal dict.
        @return: signal dict format {"prod_id":[list of signal object]}
        """
        return self.cached_signals

    def set_cached_signals(self, product_id, signals):
        """Cache ``signals`` for ``product_id`` unless already cached.

        The key is ``str(product_id)``; an existing entry is left untouched.
        Returns the whole cache dict.
        """
        self.cached_signals.setdefault(str(product_id), signals)
        return self.cached_signals

    def get_phrase_matcher(self, doc):
        """Run the question-word matcher on ``doc`` and return its matches."""
        return self.phrase_matcher(doc)

    def return_nlp(self, sentence):
        """Parse ``sentence`` with the loaded model and return the result."""
        return self.nlp(sentence)
def get_gender(self) -> str:
    """
    Guess the subject's gender from pronouns near Respondent/Applicant terms.

    Every sentence of ``self.doc`` that contains one of the search phrases
    (matched on lemma) is inspected for gendered pronouns. The gender whose
    pronouns occur in more distinct sentences wins.

    Returns:
        str: "Male", "Female", or "Unknown" on a tie (including no evidence).
    """
    # Search terms formatted
    phrases = [
        "respondent", "respondents", "applicant", 'filed an application'
    ]
    patterns = [nlp(text) for text in phrases]

    # Gender constants
    male_prons = {'he', "he's", 'his', 'himself'}
    female_prons = {'she', "she's", 'her', 'herself'}

    # PhraseMatcher setup, add tag (RESP) and pass in patterns
    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
    phrase_matcher.add("RESP", None, *patterns)

    # Sets, so each sentence counts once per gender.
    male_found = set()
    female_found = set()

    for sent in self.doc.sents:
        sent_doc = nlp(sent.text)
        if not phrase_matcher(sent_doc):
            continue
        # The match ID is always "RESP", so it cannot tell genders apart;
        # the pronouns have to be looked up among the sentence's own tokens.
        words = {token.lower_ for token in sent_doc}
        if words & male_prons:
            male_found.add(sent.text)
        if words & female_prons:
            female_found.add(sent.text)

    if len(female_found) > len(male_found):
        return "Female"
    if len(male_found) > len(female_found):
        return "Male"
    return "Unknown"
def return_objects(self):
    """Return the last ``OBJECTS`` span found in ``self.text``.

    Object phrases are read from the JSON file at ``self.objectFilePath``
    and matched against the text parsed with ``en_core_web_sm``. Returns a
    Span labelled ``"OBJECTS"`` for the last match, or ``None`` when none
    of the phrases occurs.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(self.text)

    # Context manager so the file handle is released once parsed.
    with open(self.objectFilePath, "r") as object_file:
        objects = json.load(object_file)

    object_matcher = PhraseMatcher(nlp.vocab)
    object_matcher.add("OBJECTS", list(nlp.pipe(objects)))

    # Start from None so a text without any match yields None instead of
    # failing on an unbound local at the return statement.
    object_span = None
    for _, start, end in object_matcher(doc):
        object_span = Span(doc, start, end, label="OBJECTS")
    return object_span
class EntityMatcher(object):
    """Pipeline component that adds entities for several term lists.

    ``terms_list[i]`` holds the terms for the label ``labels_list[i]``.
    Matches are appended to the entities already present on the doc.
    """

    name = 'entity_matcher'

    def __init__(self, nlp, terms_list, labels_list):
        """Register one rule per label on a single shared matcher.

        Raises IndexError when ``labels_list`` is shorter than ``terms_list``.
        """
        # Create the matcher once. Building it inside the loop discarded the
        # rules of all earlier labels and left the attribute undefined for
        # an empty ``terms_list``.
        self.matcher = PhraseMatcher(nlp.vocab)
        for idx, terms in enumerate(terms_list):
            patterns = [nlp(text) for text in terms]
            self.matcher.add(labels_list[idx], None, *patterns)

    def __call__(self, doc):
        """Append one labelled Span per match to ``doc.ents``; return the doc."""
        for match_id, start, end in self.matcher(doc):
            # The match ID is the rule key, i.e. the label of the term list.
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc
def patternGenerator(self, doc, pattern_path):
    """
    Yield a ``Phrase(text, start, end)`` tuple for every phrase match in
    ``doc``, where ``start`` and ``end`` are character offsets.

    Patterns are loaded through ``self.readPatterns`` from ``pattern_path``.
    """
    Phrase = namedtuple('Phrase', ['text', 'start', 'end'])
    tokenizer = self._nlp.tokenizer
    patterns = self.readPatterns(tokenizer, pattern_path)
    phrase_matcher = PhraseMatcher(tokenizer.vocab, max_length=6)
    phrase_matcher.add("Phrase", None, *patterns)

    # Look every token text up in the vocab before matching; the result
    # itself is discarded.
    for token in doc:
        doc.vocab[token.text]

    for _, start, end in phrase_matcher(doc):
        hit = doc[start:end]
        yield Phrase(text=hit.text, start=hit.start_char, end=hit.end_char)
def test_span_in_phrasematcher(en_vocab):
    """PhraseMatcher accepts both a Doc and a Span as its input."""
    # fmt: off
    words = [
        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and",
        "nothing", "else", "."
    ]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [Doc(en_vocab, words=["Spans", "and", "Docs"])])
    # The pattern sits inside the first eight tokens, so the Doc and the
    # Span must each report exactly one hit.
    matches_doc = matcher(doc)
    matches_span = matcher(doc[:8])
    assert len(matches_doc) == 1
    assert len(matches_span) == 1
def doccano_snippet_training_out(self, docs, terms, path):
    """Write the sentence around every term match as JSON lines.

    docs: iterable of parsed documents (sentence boundaries are required).
    terms: strings registered as phrase patterns via ``self.nlp.make_doc``.
    path: output file; one ``{"text": <sentence>}`` object per line, one
        line per match, in document order.
    """
    matcher = PhraseMatcher(self.nlp.vocab)
    matcher.add(
        "TerminologyList", None, *[self.nlp.make_doc(text) for text in terms])

    snippets = []
    for doc in docs:
        # Only the enclosing sentence of each hit is kept; the list of match
        # start offsets collected previously was never read.
        for _, start, end in matcher(doc):
            snippets.append(json.dumps({"text": doc[start:end].sent.text}))

    with open(path, 'w') as fh:
        fh.writelines(snippet + "\n" for snippet in snippets)
def test_phrase_matcher_bool_attrs(en_vocab):
    """Matching on IS_PUNCT compares punctuation flags, not token text."""
    # Pattern shape: word, word, punctuation.
    pattern = Doc(en_vocab, words=["Hello", "world", "!"])
    matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
    matcher.add("TEST", [pattern])

    doc = Doc(en_vocab, words=["No", "problem", ",", "he", "said", "."])
    matches = matcher(doc)
    assert len(matches) == 2

    expected_id = en_vocab.strings["TEST"]
    expected_bounds = ((0, 3), (3, 6))
    for (match_id, start, end), (want_start, want_end) in zip(matches, expected_bounds):
        assert match_id == expected_id
        assert start == want_start
        assert end == want_end
class JeevesSkills:
    """Pipeline component that marks matcher hits as skill entities.

    The phrase patterns are the names returned by the REST Countries API.
    Each hit becomes a Span with ``._.jenkins_skill`` set to True, and
    ``doc._.jenkins_has_skill`` reports whether any entity carries that flag.
    """

    def __init__(self, nlp, name, label="SKILL"):
        """Fetch the name list, build the matcher and register extensions.

        nlp: pipeline supplying the vocab and tokenizer.
        name: accepted but not used by this constructor.
        label: entity label assigned to every match.
        """
        response = requests.get("https://restcountries.eu/rest/v2/all")
        response.raise_for_status()  # surface HTTP errors immediately
        # Key the API records by name for easy lookup.
        self.countries = {entry["name"]: entry for entry in response.json()}
        self.label = label

        # make_doc only tokenizes, which is all a pattern needs.
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(
            "SKILLS", [nlp.make_doc(entry_name) for entry_name in self.countries])

        # Span flag: plain default, overwritten per entity in __call__.
        # force=True allows re-registering when the component is rebuilt.
        Span.set_extension("jenkins_skill", default=None, force=True)
        # Doc flag: computed on access from the entity flags.
        Doc.set_extension(
            "jenkins_has_skill", getter=self.jenkins_has_skill, force=True)

    def __call__(self, doc):
        """Append one flagged entity per match to ``doc.ents``; return the doc."""
        new_entities = []
        for _, start, end in self.matcher(doc):
            entity = Span(doc, start, end, label=self.label)
            entity._.set("jenkins_skill", True)
            new_entities.append(entity)
        # Extend the existing entities instead of replacing them.
        doc.ents = list(doc.ents) + new_entities
        return doc

    def jenkins_has_skill(self, doc):
        """Getter for the Doc attribute.

        True if any entity of ``doc`` has ``jenkins_skill`` set. The getter
        runs on attribute access, i.e. after __call__ has set the flags.
        """
        return any(entity._.get("jenkins_skill") for entity in doc.ents)
class SpacyAffirmationDetector(DialogActDetector):
    """Detect affirmations using two phrase matchers and a token matcher."""

    def __init__(self):
        """Load the English pipeline and compile the three matchers."""
        self._nlp = NLP.get('en')
        make_doc = self._nlp.make_doc
        vocab = self._nlp.vocab

        # Negated affirmations; consulted first in detect().
        self._phrase_matcher_negative = PhraseMatcher(vocab, attr="LOWER")
        self._phrase_matcher_negative.add(
            "NegatedAffirmationPhrases", None,
            *[make_doc(text) for text in negated_affirmation_phrases])

        # Plain affirmation phrases.
        self._phrase_matcher = PhraseMatcher(vocab, attr="LOWER")
        self._phrase_matcher.add(
            "AffirmationPhrases", None,
            *[make_doc(text) for text in affirmation_phrases])

        # Token-level affirmation patterns.
        self._token_matcher = Matcher(vocab)
        self._token_matcher.add("Affirmation", None, *affirmation_token_patterns)

    def detect(self, text, language='en'):
        """Return ``(is_affirmation, matched_text)`` for ``text``.

        The matchers are tried in order - negated phrases, affirmation
        phrases, token patterns - and the first hit of the first matcher
        that finds anything decides the result. A negated phrase yields
        ``(False, text)``; no hit at all yields ``(False, None)``.
        ``language`` is accepted but not used.
        """
        doc = self._nlp(text)
        stages = (
            (self._phrase_matcher_negative, False),
            (self._phrase_matcher, True),
            (self._token_matcher, True),
        )
        for matcher, verdict in stages:
            hits = matcher(doc)
            if hits:
                _, start, end = hits[0]
                return verdict, doc[start:end].text
        return False, None
def test_issue4002(en_vocab):
    """The PhraseMatcher matches on NORM values, including overwritten ones."""
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]

    # Case 1: the pattern's default norms already equal the target words.
    plain_pattern = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in plain_pattern] == ["c", "d"]
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    matcher.add("TEST", [plain_pattern])
    assert len(matcher(doc)) == 1

    # Case 2: different surface forms whose norms are overwritten to match.
    overridden_pattern = Doc(en_vocab, words=["1", "2"])
    for token, norm in zip(overridden_pattern, ("c", "d")):
        token.norm_ = norm
    assert [t.norm_ for t in overridden_pattern] == ["c", "d"]
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    matcher.add("TEST", [overridden_pattern])
    assert len(matcher(doc)) == 1
class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that tags country names.

    All countries are requested from the REST Countries API on construction.
    Matching names are labelled as entities (``GPE`` by default), merged
    into one token each, and their tokens receive country attributes.
    """

    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Fetch the country data, build the matcher, register extensions.

        nlp: shared pipeline object; supplies the vocab, the label ID and
            the Doc objects used as phrase patterns.
        label: entity label assigned to every match.
        """
        # One request at start-up; the data is kept for later lookups.
        response = requests.get('https://restcountries.eu/rest/v2/all')
        response.raise_for_status()  # surface HTTP errors immediately
        # Key the records by country name for easy lookup. Alternative and
        # foreign-language names from the API could be added the same way.
        self.countries = {entry['name']: entry for entry in response.json()}
        self.label = nlp.vocab.strings[label]  # entity label ID

        country_docs = [nlp(country) for country in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *country_docs)

        # Token attributes: plain values overwritten per token in __call__.
        # Without an explicit default they start out as None.
        Token.set_extension('is_country', default=False)
        for attribute in ('country_capital', 'country_latlng', 'country_flag'):
            Token.set_extension(attribute)

        # Doc/Span flag: computed on access from the token flags.
        for container in (Doc, Span):
            container.set_extension('has_country', getter=self.has_country)

    def __call__(self, doc):
        """Annotate ``doc`` with country entities and return it."""
        found = []  # merged only after all entities are set
        for _, start, end in self.matcher(doc):
            entity = Span(doc, start, end, label=self.label)
            found.append(entity)
            # Copy the API data onto every token of the entity. Other fields
            # (currencies, country code, calling code ...) could be added.
            for token in entity:
                token._.set('is_country', True)
                record = self.countries[entity.text]
                token._.set('country_capital', record['capital'])
                token._.set('country_latlng', record['latlng'])
                token._.set('country_flag', record['flag'])
            # Extend the existing entities instead of replacing them.
            doc.ents = list(doc.ents) + [entity]
        # Merging changes token indices, so it has to happen last.
        for entity in found:
            entity.merge()
        return doc

    def has_country(self, tokens):
        """Getter for Doc/Span: True if any contained token is a country."""
        return any(t._.get('is_country') for t in tokens)