def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    """Merging with the supplied ``underscore_attrs`` must raise ValueError.

    "a" is registered with a getter and "b" as a method extension before the
    merge is attempted.
    """
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    merge_attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError), doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs=merge_attrs)
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"LEMMA": "have"}, {"_": {"issue3555": True}}])
    # The test passes as long as running the matcher does not raise.
    matcher(doc)
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    # Resolve match IDs back to their string keys so they can be compared.
    found = sorted(
        (en_vocab.strings[key], start, end) for key, start, end in matcher(doc)
    )
    expected = sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
    assert len(found) == 4
    assert found == expected
def test_matcher_extension_set_membership(en_vocab):
    """Match tokens whose getter-backed ``reversed`` extension is in an ``IN`` set."""

    def reverse_text(token):
        return "".join(reversed(token.text))

    matcher = Matcher(en_vocab)
    Token.set_extension("reversed", getter=reverse_text, force=True)
    matcher.add("REVERSED", None, [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}])

    # "hi" and "bye" reverse to values in the set; "hello" does not.
    assert len(matcher(Doc(en_vocab, words=["hi", "bye", "hello"]))) == 2
    assert len(matcher(Doc(en_vocab, words=["aardvark"]))) == 0
def test_matcher_extension_attribute(en_vocab):
    """Match "an" followed by a token whose ``is_fruit`` extension is True."""

    def is_fruit(token):
        return token.text in ("apple", "banana")

    matcher = Matcher(en_vocab)
    Token.set_extension("is_fruit", getter=is_fruit, force=True)
    matcher.add("HAVING_FRUIT", None, [{"ORTH": "an"}, {"_": {"is_fruit": True}}])

    assert len(matcher(Doc(en_vocab, words=["an", "apple"]))) == 1
    assert len(matcher(Doc(en_vocab, words=["an", "aardvark"]))) == 0
def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    for ext_name, ext_default in (("ext_a", "str_a"), ("ext_b", "str_b")):
        Token.set_extension(ext_name, default=ext_default, force=True)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    matcher = Matcher(en_vocab)
    # The same token spec repeated three times, one per word of the doc.
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
def __init__(self, nlp, label='GPE'):
    """Initialise the pipeline component.

    The shared nlp instance is used to initialise the matcher with the shared
    vocab, get the label ID and generate Doc objects as phrase match patterns.

    nlp: The shared nlp object; called on each country name to build patterns.
    label: Entity label string, looked up in ``nlp.vocab.strings``.

    Raises whatever ``requests`` raises when the request fails, times out or
    returns an error status.
    """
    # Make request once on initialisation and store the data. A timeout is
    # passed so that construction cannot block forever on a silent server.
    r = requests.get('https://restcountries.eu/rest/v2/all', timeout=10)
    r.raise_for_status()  # make sure requests raises an error if it fails
    countries = r.json()

    # Convert API response to dict keyed by country name for easy lookup
    # This could also be extended using the alternative and foreign language
    # names provided by the API
    self.countries = {c['name']: c for c in countries}
    self.label = nlp.vocab.strings[label]  # get entity label ID

    # Set up the PhraseMatcher with Doc patterns for each country name
    patterns = [nlp(c) for c in self.countries]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('COUNTRIES', None, *patterns)

    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    # The None defaults are spelled out explicitly: newer spaCy versions
    # reject set_extension() calls that give no default, getter or method.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital', default=None)
    Token.set_extension('country_latlng', default=None)
    Token.set_extension('country_flag', default=None)

    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_country == True.
    Doc.set_extension('has_country', getter=self.has_country)
    Span.set_extension('has_country', getter=self.has_country)
def test_doc_retokenize_split_extension_attrs(en_vocab):
    """Splitting a token applies per-subtoken lemma and ``_`` attributes."""
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        # One dict of extension values per new subtoken; the second leaves
        # "a" unset so it keeps its default.
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            heads,
            attrs={"lemma": ["los", "angeles"], "_": underscore},
        )
    first, second = doc[0], doc[1]
    assert first.lemma_ == "los"
    assert first._.a is True
    assert first._.b == "1"
    assert second.lemma_ == "angeles"
    assert second._.a is False
    assert second._.b == "2"
def test_underscore_mutable_defaults_dict(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Token.set_extension("mutable", default={})
    changed = Doc(en_vocab, words=["one"])[0]
    untouched = Doc(en_vocab, words=["two"])[0]

    # Writing through one token must not leak into the other token's value.
    changed._.mutable["foo"] = "bar"
    assert len(changed._.mutable) == 1
    assert changed._.mutable["foo"] == "bar"
    assert len(untouched._.mutable) == 0

    # Overwriting an existing key keeps the size at one.
    changed._.mutable["foo"] = "baz"
    assert len(changed._.mutable) == 1
    assert changed._.mutable["foo"] == "baz"

    # Nested mutable values are writable too.
    changed._.mutable["x"] = []
    changed._.mutable["x"].append("y")
    assert len(changed._.mutable) == 2
    assert changed._.mutable["x"] == ["y"]
    assert len(untouched._.mutable) == 0
def test_issue1971(en_vocab):
    """Every match ID returned by the matcher must exist in the vocab strings."""
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    optional_bang = {"ORTH": "!", "OP": "?"}
    pattern = [
        {"ORTH": "Doe"},
        dict(optional_bang),
        {"_": {"optional": True}, "OP": "?"},
        dict(optional_bang),
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    for match_id, _start, _end in matcher(doc):
        assert match_id in en_vocab.strings
def __init__(self, nlp: Language, path: str, lang: str='en_US'):
    """Load a Hunspell dictionary and register the Token extensions.

    nlp: Must be an instance of spaCy's ``Language``.
    path: Directory holding the dictionary files, resolved against the
        current working directory.
    lang: Base name of the ``.dic`` / ``.aff`` files inside ``path``.

    Raises ValueError if ``nlp`` is not a ``Language`` instance and
    NotADirectoryError if the resolved ``path`` does not exist.
    """
    path = Path.cwd() / path

    # The previous check, any([nlp, isinstance(nlp, Language)]), passed for
    # every truthy object, so a wrong type was never rejected.
    if not isinstance(nlp, Language):
        raise ValueError('nlp must be of a spaCy Language.') from None

    if not path.exists():
        raise NotADirectoryError('{} does not exist.'.format(path)) from None

    dic_path = path / '{}.dic'.format(lang)
    aff_path = path / '{}.aff'.format(lang)

    self.hobj = HunSpell(dic_path, aff_path)

    Token.set_extension('hunspell_spell', default=None)
    Token.set_extension('hunspell_suggest', getter=self.get_suggestion)
def __init__(self, nlp):
    """Register the group extensions and build the two group matchers.

    nlp: Stored on the instance; its vocab is shared with both matchers.
    """
    self.nlp = nlp

    for doc_ext in ("outgroup_entities", "ingroup_entities"):
        Doc.set_extension(doc_ext, default=[], force=True)
    for token_ext in ("outgroup", "ingroup"):
        Token.set_extension(token_ext, default=False, force=True)

    # NOTE(review): the "_" specs below refer to an extension named
    # "ATTRIBUTE", which this method does not register - confirm intent.
    group_entity = {'ENT_TYPE': {"IN": Group_ID.GROUP}}

    self.outgroups = Matcher(nlp.vocab)
    self.outgroups.add(
        "OUTGROUP", None, [dict(group_entity), {"_": {"ATTRIBUTE": "outgroup"}}]
    )

    self.ingroups = Matcher(nlp.vocab)
    self.ingroups.add(
        "INGROUP", None, [dict(group_entity), {"_": {"ATTRIBUTE": "ingroup"}}]
    )
def load_spacy_model(team_file, players_file):
    """Load the 'en' model and append the team and player recognizers.

    team_file: Passed to ``get_teams``; the first element of its result is used.
    players_file: Passed to ``get_players``.
    RETURNS: The nlp object with both components added at the end of the
        pipeline and the ``template_tag`` / ``record_type`` extensions set.
    """
    nlp = spacy.load('en')

    teams = get_teams(team_file)[0]
    player_list = get_players(players_file)

    nlp.add_pipe(NFLTeamRecognizer(nlp, teams), last=True)
    nlp.add_pipe(NFLPlayerRecognizer(nlp, player_list), last=True)

    Token.set_extension('template_tag', default=None)
    Span.set_extension('record_type', default=None)
    return nlp
def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
    """Store the extension names and register any that are still missing.

    first_name_extension_name: Extension name stored for first names.
    last_name_extension_name: Extension name stored for last names.
    """
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name

    # Names of the extensions owned by this component (class-level constants).
    self.token_extension_name = self.TOKEN_EXTENSION_NAME
    self.span_extension_name = self.SPAN_EXTENSION_NAME
    self.doc_extension_name = self.DOC_EXTENSION_NAME

    # Each extension is registered only if nothing holds that name yet.
    if not Token.has_extension(self.token_extension_name):
        Token.set_extension(self.token_extension_name, default=self.ANOT_NONE)

    if not Span.has_extension(self.span_extension_name):
        Span.set_extension(self.span_extension_name,
                           getter=self.is_full_name_getter)

    if not Doc.has_extension(self.doc_extension_name):
        Doc.set_extension(self.doc_extension_name, default=[])
def add_custom_properties(nlp):
    """Add tokenizer special cases and the ``is_symbol`` Token extension.

    nlp: Object whose ``tokenizer`` receives the special cases.
    """

    def is_symbol_getter(token):
        # Only single-character tokens can qualify.
        if len(token) != 1:
            return False
        return unicodedata.category(token.text).startswith('S')

    # (text, form) pairs; ORTH and NORM are both set to `form`:
    #   'a'  - replace weird behavior when normalizing ('a' -> 'going to')
    #   "'s" - naive replacement of "'s" as "is" (could indicate possession)
    #   'am' - avoid ('am' -> 'a.m.')
    # NOTE(review): for "'s" the ORTH differs from the registered text -
    # confirm the installed spaCy version accepts that.
    overrides = ((u'a', u'a'), (u"'s", u'is'), (u"am", u'am'))
    for text, form in overrides:
        nlp.tokenizer.add_special_case(text, [{ORTH: form, NORM: form}])

    # Add custom token attribute for symbols
    # token._.is_symbol now returns True if the token is a unicode symbol
    Token.set_extension('is_symbol', getter=is_symbol_getter, force=True)
def __init__(self, nlp, lang=None, measures=None):
    """Initialise the pipeline component.

    nlp: Forwarded to the parent initialiser; supplies ``nlp.lang`` when
        ``lang`` is not given.
    lang: Key into ``MEASURE_PARAMETERS``.
    measures: Optional iterable of metric names; only those present in
        ``MEASURE_PARAMETERS[lang]`` are kept.
    """
    super(Readability, self).__init__(nlp, lang=lang)
    lang = lang or nlp.lang
    supported = MEASURE_PARAMETERS[lang]

    # take only supported measures
    if measures:
        selected = set(supported.keys()) & set(measures)
        self.measures = {metric: supported[metric] for metric in selected}
    else:
        self.measures = supported

    # Count extensions are always (re)registered with a None default.
    count_names = ("total_sentences", "total_words", "total_syllables",
                   "total_letters")
    for count_name in count_names:
        Doc.set_extension(count_name, default=None, force=True)
    Token.set_extension("letters_count", default=None, force=True)

    # Each selected metric is exposed through the same-named method on self,
    # unless an extension with that name already exists.
    for metric in self.measures.keys():
        if Doc.has_extension(metric):
            continue
        Doc.set_extension(metric, getter=getattr(self, metric))
def main():
    """Demo: register two getter-backed Token extensions and print the hits."""
    nlp = spacy.load('en_core_web_sm')

    def fruit_getter(token):
        return token.text in (u"apple", u"pear", u"banana")

    def pid_getter(token):
        return token.text in (u'123a123', u'1234', u'123123123')

    Token.set_extension("is_fruit", getter=fruit_getter)
    Token.set_extension("is_pid", getter=pid_getter)

    doc = nlp(u"I have an apple, a pear, and a watermelon")
    doc2 = nlp(u'123a123 SKF-23-Pump Handle Made to last')

    assert doc[3]._.is_fruit

    fruits = (token for token in doc if token._.is_fruit)
    for token in fruits:
        print('found: {}'.format(token.text))

    product_ids = (token for token in doc2 if token._.is_pid)
    for token in product_ids:
        print('{} is a product id'.format(token.text))
    print('Done')
def main():
    """Register the Token extensions, read the feature data and render it."""
    extension_defaults = (
        ("extract", False),
        ("weight", 0.0),
        ("dist_cit", 0),
        ("dist_cit_norm", 0.0),
    )
    for ext_name, ext_default in extension_defaults:
        Token.set_extension(ext_name, default=ext_default)

    exp_dir = "/Users/masterman/NLP/PhD/aac/experiments/aac_generate_kw_trace"
    features_data_filename = os.path.join(exp_dir, "feature_data.json.gz")

    contexts = FeaturesReader(features_data_filename, 10)
    parsed = get_spacy_parse(contexts)
    render_all(parsed)
def add_tagger(self, tagger, name, additional_fields=[]):
    r''' Add any kind of a tagger for tokens.

    The tagger is inserted at the front of the pipeline under the name
    ``'tag_' + name``.

    Args:
        tagger (`object/function`):
            Any object/function that takes a spacy doc as an input, does
            something and returns the same doc.
        name (`str`):
            Name for this component in the pipeline.
        additional_fields (`List[str]`):
            Fields to be added to the `_` properties of a token.
    '''
    component_name = 'tag_' + name
    self.nlp.add_pipe(tagger, name=component_name, first=True)

    # Custom field needed for this usecase
    Token.set_extension('to_skip', default=False, force=True)

    # Any additional fields that are required; each defaults to False
    for extra_field in additional_fields:
        Token.set_extension(extra_field, default=False, force=True)
def __init__(self, nlp):
    """Load the dictionaries and register the extensions.

    nlp: Accepted but not used by this initialiser.
    """
    self.load_dicts()

    # is_neg / is_pos are computed through getters, not stored as defaults.
    token_getters = (
        ("is_neg", self.is_neg_getter),
        ("is_pos", self.is_pos_getter),
        ("is_negated", self.is_negated_getter),
    )
    for ext_name, getter in token_getters:
        Token.set_extension(ext_name, getter=getter, force=True)
    Token.set_extension("span_sent", default=None, force=True)

    # Doc and Span expose the same pair of getter-backed flags.
    for container in (Doc, Span):
        container.set_extension("has_neg", getter=self.has_neg, force=True)
        container.set_extension("has_pos", getter=self.has_pos, force=True)
def _install_extensions():
    """Register the KNP-related extensions on Token, Span and Doc.

    Extension names are read from ``KNP_USER_KEYS``.
    """
    K = KNP_USER_KEYS
    units = ("bunsetsu", "morph", "tag")

    # Token: the morph element is a plain value, the other two are getters.
    Token.set_extension(K.morph.element, default=None, force=True)
    for unit in ("bunsetsu", "tag"):
        Token.set_extension(getattr(K.morph, unit), getter=token_to_knp_span(unit))

    # Span: value slots for each unit's element and list.
    for unit in units:
        unit_keys = getattr(K, unit)
        for feature in ("element", "list_"):
            Span.set_extension(getattr(unit_keys, feature), default=None, force=True)

    # Doc: getter-backed views for each unit's spans and list.
    for unit in units:
        unit_keys = getattr(K, unit)
        for feature in ("spans", "list_"):
            Doc.set_extension(
                getattr(unit_keys, feature),
                getter=get_all_knp_features_from_sents(unit, feature),
            )

    # Span: getters for spans, parent and children of bunsetsu and tag.
    for unit in (BUNSETSU, TAG):
        unit_keys = getattr(K, unit)
        Span.set_extension(unit_keys.spans, getter=get_knp_span(unit))
        Span.set_extension(unit_keys.parent, getter=get_knp_parent(unit))
        Span.set_extension(unit_keys.children, getter=get_knp_children(unit))
def test_dependency_matcher_span_user_data(en_tokenizer):
    """Matches on a span equal the doc's matches shifted by the span offset."""
    doc = en_tokenizer("a b c d e")
    root = doc[0]
    for token in doc:
        token.head = root
        token.dep_ = "a"
    Token.set_extension("is_c", default=False)
    doc[2]._.is_c = True

    matcher = DependencyMatcher(en_tokenizer.vocab)
    matcher.add("C", [[{"RIGHT_ID": "c", "RIGHT_ATTRS": {"_": {"is_c": True}}}]])

    offset = 1
    on_doc = sorted(matcher(doc))
    on_span = sorted(matcher(doc[offset:]))
    for doc_match, span_match in zip(on_doc, on_span):
        # Element 0 is the match key, element 1 the matched token indices.
        assert doc_match[0] == span_match[0]
        for doc_index, span_index in zip(doc_match[1], span_match[1]):
            assert doc_index == span_index + offset
def __init__(self, nlp, patterns, patterns_by_class, default_label=None):
    """
    Initialise the Spacy pipeline component.

    Set up the extensions on the Tokens and Spans.

    :param nlp: Spacy NLP engine
    :param patterns: List of dicts of patterns to match on
    :param patterns_by_class: List of dicts of patterns to match on,
        grouped by entity type
    :param default_label: default label to use on matched entities;
        "CUSTOM" is used when None.
    """
    self.nlp = nlp
    self.default_label = "CUSTOM" if default_label is None else default_label

    # Looking the label up adds its string to the vocab; the NER pipe is
    # told about the label as well.
    _ = self.nlp.tokenizer.vocab[self.default_label]
    self.nlp.get_pipe("ner").add_label(self.default_label)

    self.patterns = patterns
    self.patterns_by_class = patterns_by_class

    # initialise the matcher and add patterns
    self.keyword_processor = KeywordProcessor()
    for class_label, keywords in self.patterns_by_class.items():
        _ = self.nlp.tokenizer.vocab[class_label]  # add string to vocab
        self.nlp.get_pipe("ner").add_label(class_label)
        self.keyword_processor.add_keywords_from_list(keywords)

    def collect_original_labels(span):
        # Distinct original labels of the tokens in the span.
        return list({token._.original_label for token in span})

    # A ValueError from set_extension is ignored on purpose: an extension
    # that is already set is not force-overwritten.
    try:
        Token.set_extension("original_label", default=None)
    except ValueError:
        pass

    try:
        Span.set_extension("original_label", getter=collect_original_labels)
    except ValueError:
        pass

    # no callback function on the matcher patterns.
    logging.debug("PMC Flashget based pattern matcher added.")
def __init__(self, nlp, label='AzureResource'):
    """Initialise the pipeline component.

    nlp: The shared nlp instance; provides the vocab for the matcher and the
        label ID, and turns each entry of ``azureResources`` into a Doc
        pattern.
    label: Entity label string, looked up in ``nlp.vocab.strings``.
    """
    self.label = nlp.vocab.strings[label]  # entity label ID

    self.matcher = PhraseMatcher(nlp.vocab)
    resource_docs = [nlp(resource) for resource in azureResources]
    self.matcher.add('AzureResource', None, *resource_docs)

    # Token attribute: only a default is set, not a getter, because the
    # value is overwritten based on the matches.
    Token.set_extension('is_azure_resource', default=False)

    # Doc and Span attributes share one getter that checks whether one of
    # the contained tokens has is_azure_resource == True.
    for container in (Doc, Span):
        container.set_extension('has_azure_resource',
                                getter=self.has_azure_resource)
def __init__(self, nlp, pattern_id='EmailAddrTagger',
             attrs=('has_email_addr', 'is_email_addr', 'email_addr'),
             force_extension=False):
    """Initialise the pipeline component.

    nlp (Language): The shared nlp object. Its vocab backs the matcher and
        receives the email-address flag.
    pattern_id (unicode): ID of match pattern, defaults to 'EmailAddrTagger'.
        Can be changed to avoid ID clashes.
    attrs (tuple): Attributes to set on the ._ property. Defaults to
        ('has_email_addr', 'is_email_addr', 'email_addr').
    force_extension (bool): Force creation of extension objects.
    RETURNS (callable): A spaCy pipeline component.
    """
    self._has_email_addr, self._is_email_addr, self._email_addr = attrs
    self.matcher = Matcher(nlp.vocab)

    # Add email address rule to matcher: a vocab flag that is True for texts
    # the compiled expression matches from their start.
    regex_flags = re.VERBOSE | re.I | re.UNICODE
    self._email_addr_re = re.compile(email_expr, regex_flags)

    def email_addr_mask(text):
        return bool(self._email_addr_re.match(text))

    email_addr_flag = nlp.vocab.add_flag(email_addr_mask)
    self.matcher.add('email_addr', None, [{email_addr_flag: True}])

    # Add attributes: Doc and Span get the same two getters, Token a flag.
    for container in (Doc, Span):
        container.set_extension(self._has_email_addr,
                                getter=self.has_email_addr,
                                force=force_extension)
        container.set_extension(self._email_addr,
                                getter=self.iter_email_addr,
                                force=force_extension)
    Token.set_extension(self._is_email_addr, default=False,
                        force=force_extension)
def merge_compounds(doc):
    """Pipeline component to merge compound linked terms in a doc.

    For every noun chunk whose root is not itself a compound, the span from
    the chunk root's first left child with dep_ == "compound" up to the chunk
    end is merged and flagged with ``_.compound_merge = True``.
    """
    Token.set_extension("compound_merge", default=False, force=True)

    def get_compound(chunk):
        """Return the left-most compound child of the chunk root, or None."""
        left_children = list(chunk.root.lefts)
        return next(
            (child for child in left_children if child.dep_ == "compound"),
            None,
        )

    with doc.retokenize() as retokenizer:
        for chunk in doc.noun_chunks:
            if chunk.root.dep_ == "compound":
                continue
            left_token = get_compound(chunk)
            if not left_token:
                continue
            # Prefer the compound token's entity type; fall back to the
            # chunk root's entity type string when it has none.
            entity_type = left_token.ent_type or chunk.root.ent_type_
            merge_attrs = {
                "ENT_TYPE": entity_type,
                "_": {"compound_merge": True},
            }
            retokenizer.merge(doc[left_token.i: chunk.end], attrs=merge_attrs)
    return doc
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    """Merged tokens receive the extension values passed under ``"_"``."""
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)

    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(
            doc[0:2],
            attrs={"lemma": "hello world", "_": {"a": True, "b": "1"}},
        )
    merged = doc[0]
    assert merged.lemma_ == "hello world"
    assert merged._.a is True
    assert merged._.b == "1"

    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    first, second = doc[0], doc[1]
    assert first._.a is True
    assert first._.b == "1"
    assert second._.a is None
    assert second._.b == "2"
def __init__(self, nlp, name="medspacy_concept_tagger", attr_name="concept_tag"):
    """Create a new ConceptTagger.

    Params:
        nlp: A spaCy Language model.
        name (str): Stored as the component's name.
        attr_name (str): The name of the attribute to set to tokens.
    """
    self.nlp = nlp
    self.name = name
    self.attr_name = attr_name
    self.target_matcher = TargetMatcher(nlp, add_ents=False)
    self.rules = []

    # If the token attribute hasn't been set, add it now. An explicit check
    # replaces the former bare `except: pass`, which also hid unrelated
    # failures (and KeyboardInterrupt) raised during registration.
    if not Token.has_extension(attr_name):
        Token.set_extension(attr_name, default="")
def _set_extensions(self):
    """Sets the default extensions if they do not exist yet."""
    shared_keys = ["conll_str", "conll"]
    # The pandas-backed extension is only registered when pandas is
    # available and has not been disabled on this instance.
    if PD_AVAILABLE and not self.disable_pandas:
        shared_keys.append("conll_pd")

    for obj in (Doc, Span, Token):
        for key in shared_keys:
            ext_name = self.ext_names[key]
            if not obj.has_extension(ext_name):
                obj.set_extension(ext_name, default=None)

    # Adds fields from the CoNLL-U format that are not available in spaCy
    # However, ConllParser might set these fields when it has read CoNLL_str->spaCy
    for token_field in ("conll_deps_graphs_field", "conll_misc_field"):
        if not Token.has_extension(token_field):
            Token.set_extension(token_field, default="_")

    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)
def __call__(self, doc):
    """
    Runs the document through the Table Matcher Component.

    Uses a regex pattern to identify terms that likely came from a table in
    the unstructured text and sets ``feature_is_from_table`` on their tokens.

    :param doc: the document to process
    :return: the same document
    """
    logging.debug("Called Table Matcher Component")
    # NOTE(review): compiled without re.MULTILINE, so `^` only anchors at the
    # very start of doc.text - confirm that is intended.
    table_regex = re.compile(r'^(.*?)[ \t]{3,}\d+')
    Token.set_extension('feature_is_from_table', default=False, force=True)

    for match in table_regex.finditer(doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)
        # char_span yields None when no span can be built for the offsets.
        if span is not None:
            for token in span:
                token._.set('feature_is_from_table', True)
    return doc
def __init__(self, data_dir=DATA_DIR, lefff_file_name=LEFFF_FILE_NAME,
             after_melt=False):
    """Register the Token extension and load the lefff lemma mapping.

    data_dir: Directory containing the lefff data file.
    lefff_file_name: Name of the tab-separated lefff file inside ``data_dir``.
    after_melt: Stored on the instance as ``self.after_melt``.
    """
    LOGGER.info('New LefffLemmatizer instantiated.')

    # register your new attribute token._.lefff_lemma
    if Token.get_extension(self.name):
        LOGGER.info('Token {} already registered'.format(self.name))
    else:
        Token.set_extension(self.name, default=None)

    # In memory lemma mapping: (column 0, column 1) -> column 2
    self.lemma_dict = {}
    self.after_melt = after_melt

    lefff_path = os.path.join(data_dir, lefff_file_name)
    with io.open(lefff_path, encoding='utf-8') as lefff_file:
        LOGGER.info('Reading lefff data...')
        for line in lefff_file:
            columns = line.split('\t')
            self.lemma_dict[(columns[0], columns[1])] = columns[2]
    LOGGER.info('Successfully loaded lefff lemmatizer')
def __init__(self, spacy_pipeline):
    """Register the time-unit extension and label, and build the matcher.

    spacy_pipeline: Stored as ``self.nlp``; its entity component receives the
        'time_unit' label and its vocab backs the matcher.
    """
    self.nlp = spacy_pipeline
    Token.set_extension('feature_is_time_unit', default=False)
    self.nlp.entity.add_label('time_unit')

    # One single-token pattern per unit, matched on the lowercase form.
    time_units = (
        'sec', 'second', 'seconds',
        'min', 'minute', 'minutes',
        'hr', 'hour',
        'day', 'days',
        'week', 'weeks',
        'month', 'months',
        'year', 'years', 'yrs',
    )
    unit_patterns = [[{'LOWER': unit}] for unit in time_units]

    self.time_matcher = Matcher(self.nlp.vocab)
    self.time_matcher.add('UNIT_OF_TIME', None, *unit_patterns)
def __init__(self, use_spacy=False, spacy_extensions={}):
    """Optionally register Token extensions and load the spaCy model.

    use_spacy: When falsy nothing is done and ``self.nlp`` is not set.
    spacy_extensions: Mapping from a key to an iterable of settings dicts,
        e.g. {"Tokens": [{"name": "mask", "kwargs": {"default": False}}]}.
        Only settings under the "Tokens" key are applied.
    """
    if not use_spacy:
        return

    import spacy

    if spacy_extensions:
        from spacy.tokens import Token

        allowed_keys = ["Tokens"]
        for key, settings_list in spacy_extensions.items():
            for settings in settings_list:
                if key not in allowed_keys:
                    continue
                Token.set_extension(settings["name"], **settings["kwargs"])

    self.nlp = spacy.load("en_core_web_sm")
def __init__(self, nlp, companies=tuple(), label='ORG'):
    """Initialise the pipeline component.

    nlp: The shared nlp instance; provides the vocab for the matcher and the
        label ID, and turns each company name into a Doc pattern.
    companies: Iterable of company names to match.
    label: Entity label string, looked up in ``nlp.vocab.strings``.
    """
    self.label = nlp.vocab.strings[label]  # entity label ID

    # The PhraseMatcher takes Doc objects as patterns.
    self.matcher = PhraseMatcher(nlp.vocab)
    company_docs = [nlp(company) for company in companies]
    self.matcher.add('TECH_ORGS', None, *company_docs)

    # Token attribute: only a default is set, not a getter, because the
    # value is overwritten based on the matches.
    Token.set_extension('is_tech_org', default=False)

    # Doc and Span attributes share one getter that checks whether one of
    # the contained tokens has is_tech_org == True.
    for container in (Doc, Span):
        container.set_extension('has_tech_org', getter=self.has_tech_org)
def test_span_as_doc_user_data(doc):
    """Test that the user_data can be preserved (but not by default)."""
    my_key = "my_info"
    my_value = 342
    doc.user_data[my_key] = my_value
    Token.set_extension("is_x", default=False)
    doc[7]._.is_x = True

    span = doc[4:10]
    with_data = span.as_doc(copy_user_data=True)
    without_data = span.as_doc()

    assert doc.user_data.get(my_key, None) is my_value
    assert with_data.user_data.get(my_key, None) is my_value
    assert without_data.user_data.get(my_key, None) is None

    # doc[7] sits at index 3 of the span starting at 4.
    for i in range(len(with_data)):
        expected = i == 3
        assert with_data[i]._.is_x is expected
    assert not any([t._.is_x for t in without_data])
def install_extensions():
    """Register the KNP-related extensions on Token, Span and Doc.

    Extension names are read from ``KNP_USER_KEYS``.
    """
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)

    # Span: plain value slots.
    span_value_keys = (
        K.bunsetsu.element,
        K.tag.element,
        K.bunsetsu.list_,
        K.morph.list_,
        K.tag.list_,
    )
    for key in span_value_keys:
        Span.set_extension(key, default=None, force=True)

    # Doc: getter-backed list per unit.
    for unit in ("bunsetsu", "morph", "tag"):
        Doc.set_extension(
            getattr(K, unit).list_, getter=get_all_knp_list_from_sents(unit)
        )

    # Span: getters for spans, parent and children of bunsetsu and tag.
    for unit in (BUNSETSU, TAG):
        unit_keys = getattr(K, unit)
        Span.set_extension(unit_keys.spans, getter=get_knp_span(unit))
        Span.set_extension(unit_keys.parent, getter=get_knp_parent(unit))
        Span.set_extension(unit_keys.children, getter=get_knp_children(unit))
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese.

    Evaluating the class body also registers several string-valued Token
    extensions that are not registered yet.
    """

    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    # Register each missing extension with an empty-string default.
    for _ext_name in (
        'inf',
        'reading',
        'sudachi',
        'bunsetu_index',
        'bunsetu_bi_label',
        'bunsetu_position_type',
    ):
        if not Token.get_extension(_ext_name):
            Token.set_extension(_ext_name, default='')
    # Remove the loop variable so it does not become a class attribute.
    del _ext_name

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return a SudachiTokenizer built from ``nlp``."""
        return SudachiTokenizer(nlp)

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        """Return None; no lemmatizer is created here."""
        return None
def __init__(self, nlp, terms_dict, label='EMP_TYPE'):
    """Initialise the pipeline component.

    nlp: The shared nlp instance; provides the vocab for the matcher and the
        label ID, and turns each term into a Doc pattern.
    terms_dict: Mapping from a match label to the terms registered under it.
    label: Entity label string, looked up in ``nlp.vocab.strings``.
    """
    self.label = nlp.vocab.strings[label]  # entity label ID

    # The PhraseMatcher takes Doc objects as patterns; one rule is added per
    # match label.
    self.matcher = PhraseMatcher(nlp.vocab)
    for match_label, terms in terms_dict.items():
        term_docs = [nlp(term) for term in terms]
        self.matcher.add(match_label, None, *term_docs)

    # Token attributes: only defaults are set, not getters, because the
    # values are overwritten based on the matches.
    for token_flag in ('is_emp_type', 'is_part_time', 'is_full_time'):
        Token.set_extension(token_flag, default=False, force=True)

    # Doc and Span attributes are backed by getters on this instance.
    container_getters = (
        ('has_emp_type', self.has_emp_type),
        ('has_part_time', self.has_part_time),
        ('has_full_time', self.has_full_time),
    )
    for ext_name, getter in container_getters:
        Doc.set_extension(ext_name, getter=getter, force=True)
        Span.set_extension(ext_name, getter=getter, force=True)
def test_matcher_subset_value_operator(en_vocab):
    """Exercise the IS_SUBSET operator on MORPH, TAG and a list extension."""
    words = ["a", "b", "c"]

    matcher = Matcher(en_vocab)
    matcher.add("M", [[{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]])
    doc = Doc(en_vocab, words=words)
    assert len(matcher(doc)) == 3
    # Once the first token carries a feature outside the list it no longer
    # matches.
    morph_cases = [
        ("Feat=Val", 3),
        ("Feat=Val|Feat2=Val2", 3),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 2),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 2),
    ]
    for morph, expected in morph_cases:
        doc[0].set_morph(morph)
        assert len(matcher(doc)) == expected

    # IS_SUBSET acts like "IN" for attrs other than MORPH
    matcher = Matcher(en_vocab)
    matcher.add("M", [[{"TAG": {"IS_SUBSET": ["A", "B"]}}]])
    doc = Doc(en_vocab, words=words)
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # IS_SUBSET with an empty list matches nothing
    matcher = Matcher(en_vocab)
    matcher.add("M", [[{"TAG": {"IS_SUBSET": []}}]])
    doc = Doc(en_vocab, words=words)
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # IS_SUBSET with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    matcher.add("M", [[{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]])
    doc = Doc(en_vocab, words=words)
    doc[0]._.ext = ["A"]
    doc[1]._.ext = ["C", "D"]
    assert len(matcher(doc)) == 2
def custom_extensions(doc):
    """Register the lemma and negation Token extensions and return ``doc``.

    Registers three getter-backed extensions (overwriting existing ones):
    ``lemma``, ``is_negation`` and ``is_sentence_break``.

    doc: Returned unchanged.
    """
    # NOTE(review): a new GermaLemma is built on every call; consider
    # creating it once outside this function if that is expensive.
    lemmatizer = GermaLemma()
    negation_words = {
        "nie", "keinsterweise", "keinerweise", "niemals", "nichts", "kaum",
        "keinesfalls", "ebensowenig", "nicht", "kein", "keine", "weder",
    }
    negation_cconj = {'aber', 'jedoch', 'doch', 'sondern'}

    def lemma_getter(token):
        """Return the lowercased GermaLemma lemma, else spaCy's own lemma."""
        try:
            return lemmatizer.find_lemma(token.text, token.tag_).lower()
        except Exception:
            # Was a bare `except:`, which also intercepted KeyboardInterrupt
            # and SystemExit. Any ordinary lemmatizer failure still falls
            # back to spaCy's lemma.
            return token.lemma_.lower()

    def is_negation_getter(token):
        """Return True if the token's custom lemma is a negation word."""
        return token._.lemma in negation_words

    def is_sentence_break_getter(token):
        """Return True if the token's custom lemma is a contrastive conjunction."""
        return token._.lemma in negation_cconj

    Token.set_extension("lemma", getter=lemma_getter, force=True)
    Token.set_extension("is_negation", getter=is_negation_getter, force=True)
    Token.set_extension("is_sentence_break", getter=is_sentence_break_getter,
                        force=True)
    return doc
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    """Splitting with the supplied ``underscore_attrs`` must raise ValueError.

    "x" has a default value, "a" a getter and "b" is a method extension.
    """
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    split_attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError), doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=split_attrs)
def main(test_data_dir, experiment_dir, corpus):
    """Evaluate the loaded model on the "test" and "dev" sections.

    test_data_dir: Path-like root holding the "input" and "gold" directories.
    experiment_dir: Path-like root; output is written under
        ``experiment_dir / corpus``.
    corpus: Corpus name, also passed to ``load_nlp``.
    """
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)
    treebank_code = nlp.meta["treebank"]

    section_dirs = {
        "dev": "conll17-ud-development-2017-03-19",
        "test": "conll17-ud-test-2017-05-09",
    }
    for section in ("test", "dev"):
        section_dir = section_dirs[section]
        input_dir = test_data_dir / "input" / section_dir
        text_path = input_dir / (treebank_code + ".txt")
        udpipe_path = input_dir / (treebank_code + "-udpipe.conllu")
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")

        print("\t".join([section, "LAS", "UAS", "TAG", "SENT", "WORD"]))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}

        # Both paths depend only on the section, so each input type writes
        # to the same files.
        output_path = (
            experiment_dir / corpus / "{section}.conllu".format(section=section)
        )
        acc_path = (
            experiment_dir / corpus / "{section}-accuracy.json".format(section=section)
        )
        for input_type in ("udp", "raw"):
            _parsed_docs, test_scores = evaluate(
                nlp, inputs[input_type], gold_path, output_path
            )
            accuracy = print_results(input_type, test_scores)
            srsly.write_json(acc_path, accuracy)
str(i + 1), token.text, token.lemma_, token.pos_, token.tag_, "_", str(head), token.dep_.lower(), "_", "_", ] lines.append("\t".join(fields)) return "\n".join(lines) Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) ################## # Initialization # ################## def load_nlp(corpus, config): lang = corpus.split("_")[0] nlp = spacy.blank(lang) if config.vectors: nlp.vocab.from_disk(config.vectors / "vocab") return nlp