Example #1
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
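Note: underscore_attrs is supplied by a pytest.mark.parametrize decorator that this listing omits. A minimal sketch, with hypothetical values, of what such a parametrization could look like (writes to getter/method extensions or to unregistered names are the kind of input the retokenizer rejects):

import pytest

# Hypothetical values only: "a" has a getter (read-only), "b" is a method
# extension, "c" was never registered, and a list is not a valid "_" mapping.
@pytest.mark.parametrize("underscore_attrs", [{"a": 1}, {"b": 1}, {"c": 1}, [1]])
def test_merge_invalid_underscore_attrs_sketch(underscore_attrs):
    pass  # the real assertions are in the test shown above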
Example #2
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)
Example #3
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
Example #4
def test_matcher_extension_attribute(en_vocab):
    matcher = Matcher(en_vocab)
    get_is_fruit = lambda token: token.text in ("apple", "banana")
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
    matcher.add("HAVING_FRUIT", None, pattern)
    doc = Doc(en_vocab, words=["an", "apple"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["an", "aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #5
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #6
def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # This assertion previously caused a segmentation fault
    assert len(matches) == 1
Example #7
    def __init__(self, nlp, label="GPE"):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get("https://restcountries.eu/rest/v2/all")
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c["name"]: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add("COUNTRIES", None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension("is_country", default=False)
        Token.set_extension("country_capital", default=False)
        Token.set_extension("country_latlng", default=False)
        Token.set_extension("country_flag", default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension("has_country", getter=self.has_country)
        Span.set_extension("has_country", getter=self.has_country)
Example #8
    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
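The two __init__ variants above (Examples #7 and #8) only show the setup; the has_country getter and the component's __call__ are not part of this listing. A minimal sketch of how the rest of such a component could look, assuming Span is imported from spacy.tokens and that the cached REST Countries records expose 'capital', 'latlng' and 'flag' fields:

    def __call__(self, doc):
        # Label matched country names as entities and copy the cached API data
        # onto the token-level extensions registered in __init__.
        matches = self.matcher(doc)
        for _, start, end in matches:
            entity = Span(doc, start, end, label=self.label)
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            doc.ents = list(doc.ents) + [entity]
        return doc

    def has_country(self, tokens):
        # Getter shared by the Doc and Span extensions registered in __init__
        return any(t._.get('is_country') for t in tokens)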
Example #9
def add_token_extension(force=False):
    """
    Extend spaCy's :py:class:`spacy.tokens.Token` with attributes for
    sentiment specific data.

    This should be called only once during the runtime of the application.
    If multiple calls cannot be avoided, use ``force=True`` so that spaCy
    does not reject the redundant registration.
    """
    Token.set_extension('topic', default=None, force=force)
    Token.set_extension('rating', default=None, force=force)
    Token.set_extension('is_negation', default=False, force=force)
    Token.set_extension('is_intensifier', default=False, force=force)
    Token.set_extension('is_diminisher', default=False, force=force)
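A short usage sketch for the helper above (the pipeline and the token indices are made up for illustration): once the extensions are registered, they can be read and written through the underscore namespace.

import spacy

nlp = spacy.blank('en')
add_token_extension()
doc = nlp('not very good coffee')
doc[0]._.is_negation = True      # "not"
doc[1]._.is_intensifier = True   # "very"
doc[3]._.topic = 'coffee'
assert doc[2]._.rating is None   # untouched tokens keep the defaults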
Example #10
    def process(self, docOfSentence, nlp, optionalObject=None):
        docOfSentence, sentenceEnding = self.removeAllSpacesAndPunctiationMarksAtEndOfSentence(
            docOfSentence)

        Token.set_extension("isMainClause", default=False, force=True)
        Token.set_extension("predicateIsAtBegin", default=True, force=True)
        Token.set_extension("shouldBeLowercase", default=False, force=True)
        Token.set_extension("belongsToPreviousPart", default=False, force=True)
        Span.set_extension("isMainClause", default=False, force=True)
        Span.set_extension("predicateIsAtBegin", default=True, force=True)
        Span.set_extension("shouldBeLowercase", default=False, force=True)
        Span.set_extension("belongsToPreviousPart", default=False, force=True)

        # Split the sentence into its main and subordinate clauses
        allSentenceParts = self.splitAndCategorizeSentenceParts(docOfSentence)
        if allSentenceParts is None:
            return None

        # Link subordinate clauses to their main clauses
        relatedMainAndDependentClauses = self.relateSentenceToEachOther(
            allSentenceParts)
        if relatedMainAndDependentClauses is None:
            return None

        # Swap the main and subordinate clauses
        possibleVariations = self.changeMainAndDependentClauses(
            relatedMainAndDependentClauses, sentenceEnding)

        # Return the variations
        return possibleVariations
Example #11
 def __init__(self, spacy_pipeline, labels):
     """
     :param spacy_pipeline: An existing spaCy pipeline
     :param labels: The subset of labels from the gold annotations to restrict labeling to.
     """
     super().__init__(
         component_name=self.name,
         dependencies=self.dependencies
     )
     self.nlp = spacy_pipeline
     self.labels = labels
     self.failed_overlay_count = 0
     self.failed_identifying_span_count = 0
     Token.set_extension('gold_label', default="O", force=True)
Example #12
def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # This assertion previously caused a segmentation fault
    assert len(matches) == 1
    assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
Example #13
def test_matcher_superset_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{
        "MORPH": {
            "IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]
        }
    }]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1

    # IS_SUPERSET with more than one value only matches for MORPH
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # IS_SUPERSET with one value is the same as ==
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # IS_SUPERSET with an empty value matches everything
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 3

    # IS_SUPERSET with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1
Example #14
    def __init__(self, nlp, path=HUNSPELL_PROFILE):
        if path in DEFAULT_DICTIONARY_PATHS:
            default_path = DEFAULT_DICTIONARY_PATHS[path]
            dic_path, aff_path = (
                os.path.join(default_path, 'en_US.dic'),
                os.path.join(default_path, 'en_US.aff'),
            )
        else:
            assert len(path) == 2, 'Include two paths: dic_path and aff_path'
            dic_path, aff_path = path

        self.hobj = HunSpell(dic_path, aff_path)

        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)
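The __init__ above references self.get_suggestion and leaves hunspell_spell to be filled in later; a minimal sketch of those two pieces, assuming hobj follows the HunSpell API (spell() returning a bool, suggest() returning a list):

    def __call__(self, doc):
        # Fill the hunspell_spell flag for every token
        for token in doc:
            token._.hunspell_spell = self.hobj.spell(token.text)
        return doc

    def get_suggestion(self, token):
        # Only ask Hunspell for suggestions when the token is flagged as misspelled
        if token._.hunspell_spell is False:
            return self.hobj.suggest(token.text)
        return []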
Example #15
    def __init__(self, nlp, semantic):
        elements = semantic.get_all_values()
        self.label = nlp.vocab.strings[self.name]

        patterns = [nlp(org) for org in elements]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(self.name, None, *patterns)

        Token.set_extension(self.extension, default=False, force=True)
        Doc.set_extension(self.extension,
                          getter=self.has_quantifier,
                          force=True)
        Span.set_extension(self.extension,
                           getter=self.has_quantifier,
                           force=True)
Example #16
    def __init__(self, spacy_pipeline):
        self.nlp = spacy_pipeline
        Token.set_extension('feature_is_volume_unit', default=False)
        self.nlp.entity.add_label('volume_unit')
        self.volume_matcher = Matcher(self.nlp.vocab)

        self.volume_matcher.add('UNIT_OF_VOLUME', None, [{
            'LOWER': 'ml'
        }], [{
            'ORTH': 'dL'
        }], [{
            'LOWER': 'cc'
        }], [{
            'ORTH': 'L'
        }])
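A hypothetical __call__ to go with the __init__ above (spaCy v2-era API, with Span assumed to be imported from spacy.tokens): it flags matched tokens and overlays volume_unit entities. Overlapping matches would need to be filtered before assigning doc.ents.

    def __call__(self, doc):
        matches = self.volume_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=self.nlp.vocab.strings['volume_unit'])
            for token in span:
                token._.feature_is_volume_unit = True
            doc.ents = list(doc.ents) + [span]
        return doc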
Example #17
def sentences_gen(labels):
    for label in labels:
        doc = nlp(gendocs(label))

        for i, sent in enumerate(doc.sents):

            res = []
            for j, token in enumerate(sent):
                # the extension only needs to be registered once; force=True
                # makes this repeated registration harmless
                Token.set_extension('lemma', getter=lemma_getter, force=True)
                if not token.is_punct and not token.is_digit and not token.is_space:
                    tok = token._.lemma.lower()
                    tok = tok.replace('.', '')
                    res.append(tok)
            # print(sent)
            yield res
Example #18
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
Example #19
def darcolor(doc):
    try:
        Token.set_extension('plot', default={})
    except ValueError:
        # the extension is already registered
        pass
    for token in doc:
        node_label = '{0} [{1}] /{2})'.format(token.orth_, token.i, token.pos_)
        token._.plot['label'] = node_label
        if token.pos_ == 'VERB':
            token._.plot['color'] = 'green'
        elif token.pos_ == 'PROPN':
            token._.plot['color'] = 'red'
        elif token.pos_ == 'NOUN':
            token._.plot['color'] = 'blue'
    return doc
Example #20
def test_custom_attribute(text):
    from spacy.tokens import Token

    fruit_getter = lambda token: token.text in ("apple", "pear", "banana")
    Token.set_extension("is_fruit", getter=fruit_getter)
    doc = dframcy.nlp(text)
    dataframe = dframcy.to_dataframe(doc,
                                     columns=["id", "start", "end"],
                                     custom_attributes=["is_fruit"])
    results = pd.DataFrame({
        "token_start": [0, 2, 7, 10],
        "token_end": [1, 6, 9, 15],
        "token_is_fruit": [False, False, False, True],
    })
    assert_frame_equal(dataframe, results)
Example #21
 def test_call_lexicon_component(self):
     """
     Test running a doc through the lexicon component and properly overlaying features from
     the lexicon.
     :return:
     """
     lexicon_component = LexiconComponent(self.nlp, self.lexicon)
     self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'),
                   False)
     self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'),
                   False)
     doc = lexicon_component(self.doc)
     self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'), True)
     self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'),
                   True)
Example #22
    def __init__(self, nlp, pattern_list, match_id='FALSE_DATE', label='FALSE_DATE', regex_pat=regex_pat):
        # register a new token extension to flag false_date tokens

        self.label = nlp.vocab.strings[label]  # get entity label ID
        self.orig_label = nlp.vocab.strings['DATE']  # get entity label ID for date
        Token.set_extension('is_false_date', default=False, force=True)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(match_id, None, pattern_list)
        self.regex_pat = regex_pat
        self.nlp=nlp

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_false_date == True.
        Doc.set_extension('has_false_date', getter=self.has_false_date, force=True)
        Span.set_extension('has_false_date', getter=self.has_false_date, force=True)
Example #23
    def _task_add_metadata_per_doc(self, key, data, default):
        logger.debug('worker `%s`: adding metadata per document' % self.name)

        attr_name = 'meta_' + key
        Token.set_extension(attr_name, default=default)

        for doc in self._docs:
            meta_vals = data.get(doc._.label, [default] * len(doc))
            assert sum(doc.user_data['mask']) == len(meta_vals)
            for t, v, m in zip(doc, meta_vals, doc.user_data['mask']):
                if m:
                    setattr(t._, attr_name, v)

        if key not in self._metadata_attrs:
            self._metadata_attrs[key] = default
Example #24
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
Example #25
    def __init__(self, spacy_pipeline):
        self.nlp = spacy_pipeline
        Token.set_extension('feature_is_mass_unit', default=False)
        self.nlp.entity.add_label('mass_unit')
        self.mass_matcher = Matcher(self.nlp.vocab)

        self.mass_matcher.add('UNIT_OF_MASS', None,
                              [{'LOWER': 'mcg'}],
                              [{'LOWER': 'microgram'}],
                              [{'LOWER': 'micrograms'}],
                              [{'ORTH': 'mg'}],
                              [{'LOWER': 'milligram'}],
                              [{'LOWER': 'g'}],
                              [{'LOWER': 'kg'}],
                              [{'ORTH': 'mEq'}])
Example #26
    def __init__(self,
                 nlp,
                 pattern_id='IPTagger',
                 attrs=('has_ipv4', 'is_ipv4', 'ipv4'),
                 force_extension=False,
                 subnets_to_keep=4):
        """Initialise the pipeline component.

        nlp (Language): The shared nlp object. Used to initialise the matcher
            with the shared `Vocab`, and create `Doc` match patterns.
        pattern_id (unicode): ID of match pattern, defaults to 'IPTagger'. Can be
            changed to avoid ID clashes.
        attrs (tuple): Attributes to set on the ._ property. Defaults to
            ('has_ipv4', 'is_ipv4', 'ipv4').
        force_extension (bool): Force creation of extension objects.
        subnets_to_keep (int): Number of subnets to include in lemmatization.
        RETURNS (callable): A spaCy pipeline component.
        """
        self._has_ipv4, self._is_ipv4, self._ipv4 = attrs
        self.matcher = Matcher(nlp.vocab)

        if (subnets_to_keep < 1) or (subnets_to_keep > 4):
            raise ValueError('Subnets_to_keep must be in the range 1-4')
        self.subnets_to_keep = subnets_to_keep

        # Add IPv4 rule to matcher
        self._ipv4_re = re.compile(ipv4_expr, re.VERBOSE | re.I | re.UNICODE)
        ipv4_mask = lambda text: bool(self._ipv4_re.match(text))
        ipv4_flag = nlp.vocab.add_flag(ipv4_mask)
        self.matcher.add('IPV4', None, [{ipv4_flag: True}])

        # Add attributes
        # Need to force since extensions are global by default
        Doc.set_extension(self._has_ipv4,
                          getter=self.has_ipv4,
                          force=force_extension)
        Doc.set_extension(self._ipv4,
                          getter=self.iter_ipv4,
                          force=force_extension)
        Span.set_extension(self._has_ipv4,
                           getter=self.has_ipv4,
                           force=force_extension)
        Span.set_extension(self._ipv4,
                           getter=self.iter_ipv4,
                           force=force_extension)
        Token.set_extension(self._is_ipv4,
                            default=False,
                            force=force_extension)
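The getters and the matching step are not shown above; a sketch of what they could look like, assuming matched IPV4 tokens simply get the is_ipv4 flag and the Doc/Span getters aggregate it:

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            for token in doc[start:end]:
                token._.set(self._is_ipv4, True)
        return doc

    def has_ipv4(self, tokens):
        return any(token._.get(self._is_ipv4) for token in tokens)

    def iter_ipv4(self, tokens):
        return [token for token in tokens if token._.get(self._is_ipv4)]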
Example #27
    def __init__(
        self,
        nlp: Language,
        merge_spans: bool = True,
        lookup: Optional[Dict[str, str]] = None,
        pattern_id: str = "EMOJI",
        attrs: Tuple[str, str, str, str] = DEFAULT_ATTRS,
        force_extension: bool = True,
    ) -> None:
        """Initialise the pipeline component.

        nlp (Language): The shared nlp object. Used to initialise the matcher
            with the shared `Vocab`, and create `Doc` match patterns.
        attrs (tuple): Attributes to set on the ._ property. Defaults to
            ('has_emoji', 'is_emoji', 'emoji_desc', 'emoji').
        pattern_id (unicode): ID of match pattern, defaults to 'EMOJI'. Can be
            changed to avoid ID clashes.
        merge_spans (bool): Merge spans containing multi-character emoji. Will
            only merge combined emoji resulting in one icon, not sequences.
        lookup (dict): Optional lookup table that maps emoji unicode strings
            to custom descriptions, e.g. translations or other annotations.
        RETURNS (callable): A spaCy pipeline component.
        """
        self._has_emoji, self._is_emoji, self._emoji_desc, self._emoji = attrs
        self.merge_spans = merge_spans
        self.lookup = lookup or {}
        self.matcher = PhraseMatcher(nlp.vocab)
        emoji_patterns = list(nlp.tokenizer.pipe(EMOJI.keys()))
        self.matcher.add(pattern_id, None, *emoji_patterns)
        # Add attributes
        Doc.set_extension(self._has_emoji,
                          getter=self.has_emoji,
                          force=force_extension)
        Doc.set_extension(self._emoji,
                          getter=self.iter_emoji,
                          force=force_extension)
        Span.set_extension(self._has_emoji,
                           getter=self.has_emoji,
                           force=force_extension)
        Span.set_extension(self._emoji,
                           getter=self.iter_emoji,
                           force=force_extension)
        Token.set_extension(self._is_emoji,
                            default=False,
                            force=force_extension)
        Token.set_extension(self._emoji_desc,
                            getter=self.get_emoji_desc,
                            force=force_extension)
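A sketch, not necessarily the library's exact implementation, of a __call__ and an emoji-description getter consistent with the attributes and the merge_spans flag set up above:

    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []
        for match_id, start, end in matches:
            span = doc[start:end]
            for token in span:
                token._.set(self._is_emoji, True)
            spans.append(span)
        if self.merge_spans:
            with doc.retokenize() as retokenizer:
                for span in spans:
                    if len(span) > 1:
                        retokenizer.merge(span)
        return doc

    def get_emoji_desc(self, token):
        # Prefer a custom lookup entry, fall back to the EMOJI table
        if token.text in self.lookup:
            return self.lookup[token.text]
        return EMOJI.get(token.text)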
Example #28
def _get_dep_noun(tag: Token) -> str:
    f: Dict[str, Any] = tag._.knp_morph_tag._.knp_tag_element.features
    if "係" not in f:
        return "dep"
    k = f["係"] if f["係"] != "未格" or "解析格" not in f else f["解析格"] + "格"
    x = {
        "隣": "nmod",
        "文節内": "compound",
        "ガ格": "nsubj",
        "ヲ格": "obj",
        "ガ2格": "dislocated",
    }
    if k in x:
        return x[k]
    elif k == "ノ格":
        if tag.head.pos in {VERB, ADJ}:
            return "nsubj"
        elif tag.pos in {DET, PRON}:
            tag.pos = DET
            return "det"
        else:
            return "nummod" if tag.pos == NUM else "nmod"
    elif "並列タイプ" in f:
        if tag.head.pos in {VERB, ADJ}:
            return "obl"
        else:
            return "conj"
    return "obl"
Example #29
    def __init__(self, nlp, lemma_sequences, attribute, label, name, merge=False):
        self.name = name
        self.nlp = nlp
        self.label = label
        self.attribute = attribute
        self.matcher = Matcher(self.nlp.vocab)
        self.merge = merge

        # Build patterns from sequences of lemmas read from the lexicon file
        for lemmas in lemma_sequences:
            pattern = []
            for lemma in lemmas.split():
                pattern.append({LEMMA: lemma})
            self.matcher.add(label, None, pattern)

        Token.set_extension(attribute, default=False, force=True)
Example #30
    def __init__(self, nlp, attr_name="concept_tag"):
        """Create a new ConceptTagger.
        Params:
            nlp: A spaCy Language model.
            attr_name (str): The name of the attribute to set to tokens.
        """
        self.nlp = nlp
        self.attr_name = attr_name
        self.target_matcher = TargetMatcher(nlp, add_ents=False)
        self.rules = []

        # If the token attribute hasn't been set, add it now
        try:
            Token.set_extension(attr_name, default="")
        except ValueError:
            # the extension is already registered
            pass
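A hypothetical continuation of the ConceptTagger above: rules are added to the TargetMatcher, and __call__ copies each matched rule's label onto the configured token attribute. The add signature and the span labels are assumptions, not the library's documented API.

    def add(self, rules):
        self.rules += rules
        self.target_matcher.add(rules)

    def __call__(self, doc):
        for span in self.target_matcher(doc):
            for token in span:
                setattr(token._, self.attr_name, span.label_)
        return doc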
Example #31
 def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='',
              attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
     self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
     self.keyword_processor = KeywordProcessor()
     self.keyword_processor.add_keywords_from_list(keywords_list)
     self.keyword_processor.add_keywords_from_dict(keywords_dict)
     if keywords_file:
         self.keyword_processor.add_keyword_from_file(keywords_file)
     self.label = label
     # Add attributes
     Doc.set_extension(self._has_entities, getter=self.has_entities)
     Doc.set_extension(self._entities, getter=self.iter_entities)
     Span.set_extension(self._has_entities, getter=self.has_entities)
     Span.set_extension(self._entities, getter=self.iter_entities)
     Token.set_extension(self._is_entity, default=False)
     Token.set_extension(self._entity_desc, getter=self.get_entity_desc)
Example #32
    def __init__(self):

        # register Token attributes if they are not registered already
        from spacy.tokens import Token

        for attr_name in [
                "speaker", "start_time", "end_time", "confidence",
                "entity_linking", "addressee"
        ]:
            if not Token.has_extension(attr_name):
                Token.set_extension(attr_name, default=None)

        # register Span attributes if they are not registered already
        from spacy.tokens import Span

        if not Span.has_extension("speaker"):
            Span.set_extension("speaker", getter=self.span_speaker)

        if not Span.has_extension("start_time"):
            Span.set_extension("start_time", getter=self.span_start_time)

        if not Span.has_extension("end_time"):
            Span.set_extension("end_time", getter=self.span_end_time)

        if not Span.has_extension("confidence"):
            Span.set_extension("confidence",
                               getter=self.span_average_confidence)

        if not Span.has_extension("entity_linking"):
            Span.set_extension("entity_linking",
                               getter=self.span_entity_linking)

        if not Span.has_extension("addressee"):
            Span.set_extension("addressee", getter=self.span_addressee)

        # minimalist spaCy pipeline (used only for its tokenizer)
        self.tokenizer = spacy.load("en_core_web_sm",
                                    disable=["tagger", "parser", "ner"])

        # custom spaCy pipeline (that adds forced alignment attributes and ensures
        # that a new sentence starts at every speaker change)
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.add_pipe(self.placeholder,
                          name="forced_alignment",
                          first=True)
        self.nlp.add_pipe(self.start_sentence_at_speaker_change,
                          after="forced_alignment")
Example #33
    def __call__(self, doc):
        # hard-coded integer IDs for the nsubj/dobj dependency labels
        NSUBJ = 429
        DOBJ = 416

        # doc._.set("word_pair", None)
        for sentence in doc.sents:
            word_pairs = []

            for chunk in sentence.noun_chunks:
                if not chunk.root.head.pos == VERB or not (
                        chunk.root.dep == NSUBJ or chunk.root.dep == DOBJ):
                    continue

                if not (chunk.root.head.is_stop and chunk.root.is_stop):
                    continue

                noun_norm = chunk.root.text if chunk.root.pos == PRON else chunk.root.lemma_
                noun = Token(noun_norm, chunk.root)
                verb = Token(chunk.root.head.lemma_, chunk.root.head)
                word_pair = WordPair(verb, noun)
                word_pair.noun_chunk = chunk

                # word pair vectorized
                word_pair = self.nlp.w2v(word_pair)

                if not word_pair.has_vector:
                    continue

                # word pair clustered
                word_pair = self.v2c(word_pair)

                # SPS identification
                word_pair.sps = self.word_freq.sps(word_pair.verb.cluster)

                # SA identification
                word_pair.sa = 0
                for word in chunk:
                    word_pair.sa += self.word_freq.sa(word_pair.verb.cluster,
                                                      word.cluster)

                word_pair.sa = word_pair.sa / len(chunk)

                word_pairs.append(word_pair)

            sentence._.set("word_pairs", word_pairs)

        return doc
Example #34
def test_underscore_mutable_defaults_dict(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Token.set_extension("mutable", default={})
    token1 = Doc(en_vocab, words=["one"])[0]
    token2 = Doc(en_vocab, words=["two"])[0]
    token1._.mutable["foo"] = "bar"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "bar"
    assert len(token2._.mutable) == 0
    token1._.mutable["foo"] = "baz"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "baz"
    token1._.mutable["x"] = []
    token1._.mutable["x"].append("y")
    assert len(token1._.mutable) == 2
    assert token1._.mutable["x"] == ["y"]
    assert len(token2._.mutable) == 0
Example #35
 def __init__(self, nlp):
     # register a new token extension to flag bad HTML
     Token.set_extension('bad_html', default=False, force=True)
     self.matcher = Matcher(nlp.vocab)
     self.matcher.add('BAD_HTML', None, [{
         'ORTH': '<'
     }, {
         'LOWER': 'br'
     }, {
         'ORTH': '>'
     }], [{
         'ORTH': '<'
     }, {
         'LOWER': 'br/'
     }, {
         'ORTH': '>'
     }])
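A minimal sketch of the __call__ that would go with this component: every token inside a matched <br> / <br/> sequence gets the bad_html flag. Merging the matched spans into single tokens would be an optional extra step.

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            for token in doc[start:end]:
                token._.bad_html = True
        return doc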
Example #36
def in_compound(tok: Token):
    """Returns true if the spacy token is part of a compound phrase"""

    if tok.dep_ == "compound":
        return True
    elif tok.i > 0 and tok.nbor(-1).dep_ == "compound":
        return True
    return False
Example #37
 def __init__(self,
              data_dir=DATA_DIR,
              lefff_file_name=LEFFF_FILE_NAME,
              after_melt=False):
     LOGGER.info('New LefffLemmatizer instantiated.')
     # register your new attribute token._.lefff_lemma
     Token.set_extension('lefff_lemma', default=None)
      # in-memory lemma mapping
     self.lemma_dict = {}
     self.after_melt = after_melt
     with io.open(os.path.join(data_dir, lefff_file_name),
                  encoding='utf-8') as lefff_file:
         LOGGER.info('Reading lefff data...')
         for line in lefff_file:
             els = line.split('\t')
             self.lemma_dict[(els[0], els[1])] = els[2]
     LOGGER.info('Successfully loaded lefff lemmatizer')
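The lemma_dict built above is keyed by (form, tag) pairs; a hypothetical lookup step (the key normalisation and the choice of token.pos_ as the tag are assumptions) could fill the registered extension like this:

    def __call__(self, doc):
        for token in doc:
            token._.lefff_lemma = self.lemma_dict.get((token.text.lower(), token.pos_))
        return doc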
Example #38
def test_underscore_mutable_defaults_dict(en_vocab):
    """Test that mutable default arguments are handled correctly (see #2581)."""
    Token.set_extension("mutable", default={})
    token1 = Doc(en_vocab, words=["one"])[0]
    token2 = Doc(en_vocab, words=["two"])[0]
    token1._.mutable["foo"] = "bar"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "bar"
    assert len(token2._.mutable) == 0
    token1._.mutable["foo"] = "baz"
    assert len(token1._.mutable) == 1
    assert token1._.mutable["foo"] == "baz"
    token1._.mutable["x"] = []
    token1._.mutable["x"].append("y")
    assert len(token1._.mutable) == 2
    assert token1._.mutable["x"] == ["y"]
    assert len(token2._.mutable) == 0
Example #39
def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
Example #40
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
Example #41
    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)
Example #42
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Example #43
def main(test_data_dir, experiment_dir, corpus):
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)

    treebank_code = nlp.meta["treebank"]
    for section in ("test", "dev"):
        if section == "dev":
            section_dir = "conll17-ud-development-2017-03-19"
        else:
            section_dir = "conll17-ud-test-2017-05-09"
        text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
        udpipe_path = (
            test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
        )
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")

        header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
        print("\t".join(header))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
        for input_type in ("udp", "raw"):
            input_path = inputs[input_type]
            output_path = (
                experiment_dir / corpus / "{section}.conllu".format(section=section)
            )

            parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)

            accuracy = print_results(input_type, test_scores)
            acc_path = (
                experiment_dir
                / corpus
                / "{section}-accuracy.json".format(section=section)
            )
            srsly.write_json(acc_path, accuracy)
Example #44
def get_token_conllu(token, i):
    # head field is 0 for the sentence root, otherwise the 1-based index of
    # the head token within the sentence
    if token.head.i == token.i:
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    lines = []
    fields = [
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)


Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)


##################
# Initialization #
##################


def load_nlp(corpus, config):
    lang = corpus.split("_")[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp
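A small usage sketch for the method extension registered above (hypothetical input text, assuming this module's imports): each token renders its own CoNLL-U line through the underscore namespace.

nlp = spacy.blank("en")
doc = nlp("Los Angeles starts")
conllu = "\n".join(token._.get_conllu_lines(i) for i, token in enumerate(doc))
print(conllu)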