def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attributes on the Token. We'll be overwriting these based
        # on the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital', default=False)
        Token.set_extension('country_latlng', default=False)
        Token.set_extension('country_flag', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
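The extension registration above follows a common spaCy pattern: a writable Token-level default plus read-only Doc/Span getters derived from the tokens. A minimal self-contained sketch of that pattern (the sample sentence and token index are illustrative, not taken from the component above):

import spacy
from spacy.tokens import Doc, Span, Token


def has_country(obj):
    # True if any token in the Doc or Span has been flagged as a country
    return any(t._.is_country for t in obj)


Token.set_extension("is_country", default=False, force=True)
Doc.set_extension("has_country", getter=has_country, force=True)
Span.set_extension("has_country", getter=has_country, force=True)

nlp = spacy.blank("en")
doc = nlp("I visited Colombia last year")
doc[2]._.is_country = True       # pretend the PhraseMatcher flagged this token
print(doc._.has_country)         # True
print(doc[3:5]._.has_country)    # False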
Example #2
    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)
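For context, a self-contained PhraseMatcher sketch mirroring the setup above. The snippet uses the older spaCy v2 call matcher.add(key, None, *patterns); the list form below is the v3 signature (late v2 releases accept it as well). The company names are illustrative only.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
companies = ["Alphabet Inc.", "Apple", "Netflix"]
matcher = PhraseMatcher(nlp.vocab)
matcher.add("TECH_ORGS", [nlp(org) for org in companies])

doc = nlp("Apple announced a partnership with Netflix")
print([doc[start:end].text for _, start, end in matcher(doc)])
# ['Apple', 'Netflix']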
Example #3
    def cohesive(self, *, punct: bool = False) -> Dict[str, int]:
        """Returns cohesive markers values.

        It process the document using a spaCy PhraseMatcher to
        finde the cohesive markers given in a list.

        Parameters:
        punct: bool - Flag to take into account punctuation.

        Returns:
        Dictionary with markers as keys and counts as values.
        """
        doc = self.doc
        matcher = self.matcher
        features: DefaultDict[str, int] = defaultdict(int)
        matches = matcher(doc)
        spans = [Span(doc, start, end) for match_id, start, end in matches]
        if punct:
            spans = self._extended_spans(spans)
        for string in (span.text.lower() for span in spans):
            features[string] += 1
        return dict(features)
Example #4
 def __call__(self, doc):
     labels = ["DATE", "GPE", "NORP"]
     previous_labels = [
         "CAMP", "GHETTO", "DATE", "LOCATION", "NORP", "EVENT"
     ]
     new_ents = []
     for old_ent in doc.ents:
         if old_ent.label_ == "DATE":
             new_ents.append(old_ent)
         elif old_ent.label_ == "GPE":
             new_ent = Span(doc,
                            old_ent.start,
                            old_ent.end,
                            label="LOCATION")
             new_ents.append(new_ent)
         elif old_ent.label_ == "NORP":
             new_ents.append(old_ent)
         elif old_ent.label_ in previous_labels:
             new_ents.append(old_ent)
     doc.ents = tuple(new_ents)
     return doc
Example #5
    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = [
            Span(doc, match[1], match[2], label=self.all_labels)
            for match in matches
        ]
        for span in spans:
            span._.set("has_ontols", True)
            for token in span:
                if span.text.lower() in self.terms:
                    token._.set("is_ontol_term", True)
                    token._.set("ontol_id",
                                self.terms[span.text.lower()]["id"])
                else:
                    print("Term not found: ", span.text.lower())

        with doc.retokenize() as retokenizer:
            for span in filter_spans(spans):
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
                doc._.ontols = list(doc._.ontols) + [span]

        return doc
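The retokenize loop above depends on spacy.util.filter_spans to drop overlapping matches before merging. A small sketch of what it does (the longest span wins; ties go to the span that starts first); the example text is illustrative:

import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("the gene expression profile analysis")
candidates = [doc[0:3], doc[1:4], doc[3:5]]    # overlapping spans
print([span.text for span in filter_spans(candidates)])
# ['the gene expression', 'profile analysis']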
Example #6
    def pipe(self, docs: Iterable[Doc]) -> Iterable[Doc]:
        """Annotates the stream of documents based on the Spacy model"""

        stream1, stream2 = itertools.tee(docs, 2)

        # Remove existing entities from the document
        stream2 = (self.create_new_doc(d) for d in stream2)

        # And run the model
        for _, proc in self.model.pipeline:
            stream2 = proc.pipe(stream2)

        for doc, doc_copy in zip(stream1, stream2):

            doc.spans[self.name] = []

            # Add the annotation
            for ent in doc_copy.ents:
                doc.spans[self.name].append(
                    Span(doc, ent.start, ent.end, ent.label_))

            yield doc
Example #7
 def __call__(self, doc):
     """Apply the pipeline component on a Doc object and modify it if matches
     are found. Return the Doc, so it can be processed by the next component
     in the pipeline, if available.
     """
     matches = self.matcher(doc)
     spans = []  # keep the spans for later so we can merge them afterwards
     for _, start, end in matches:
         # Generate Span representing the entity & set label
         entity = Span(doc, start, end, label=self.label)
         spans.append(entity)
         # Set custom attribute on each token of the entity
         for token in entity:
             token._.set('is_tech_org', True)
         # Overwrite doc.ents and add entity – be careful not to replace!
         doc.ents = list(doc.ents) + [entity]
     for span in spans:
         # Iterate over all spans and merge them into one token. This is done
         # after setting the entities – otherwise, it would cause mismatched
         # indices!
         span.merge()
     return doc  # don't forget to return the Doc!
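Note that Span.merge(), used above, is the spaCy v2 API and was removed in v3, where the retokenizer is the replacement. A minimal self-contained sketch of the v3-style merge (text and label are illustrative):

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I work at Acme Corp Inc")
entity = Span(doc, 3, 6, label="ORG")
doc.ents = [entity]
with doc.retokenize() as retokenizer:
    retokenizer.merge(entity)             # collapse the span into one token
print([t.text for t in doc])              # ['I', 'work', 'at', 'Acme Corp Inc']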
Example #8
def test_displacy_rtl():
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    deps = ["foo", "bar", "foo", "baz"]
    heads = [1, 0, 1, -2]
    nlp = Persian()
    doc = get_doc(nlp.vocab,
                  words=words,
                  pos=pos,
                  tags=pos,
                  heads=heads,
                  deps=deps)
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    html = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in html
    assert 'direction="rtl"' in html
    assert 'lang="{}"'.format(nlp.lang) in html
    html = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in html
    assert 'lang="{}"'.format(nlp.lang) in html
Example #9
def test_history(nlp):
    text = re.sub("\\s+", " ", """This is a story about Pierre Lison and his work at 
                  Yetanothername Inc., which is just a name we invented. But of course, 
                  Lison did not really work for Yetanothername, because it is a fictious 
                  name, even when spelled like YETANOTHERNAME.""")
    doc = nlp(text)
    annotator1 = skweak.spacy.ModelAnnotator("spacy", "en_core_web_sm")
    annotator2 = skweak.doclevel.DocumentHistoryAnnotator("hist_cased", "spacy", ["PERSON", "ORG"])
    annotator3 = skweak.doclevel.DocumentHistoryAnnotator("hist_uncased", "spacy", ["PERSON", "ORG"],
                                                          case_sentitive=False)
    doc = annotator3(annotator2(annotator1(doc)))
    assert Span(doc, 5, 7, "PERSON") in doc.spans["spacy"]
    assert Span(doc, 11, 13, "ORG") in doc.spans["spacy"]
    assert Span(doc, 26, 27, "PERSON") in doc.spans["hist_cased"]
    assert Span(doc, 32, 33, "ORG") in doc.spans["hist_cased"]
    assert Span(doc, 32, 33, "ORG") in doc.spans["hist_uncased"]
    assert Span(doc, 45, 46, "ORG") in doc.spans["hist_uncased"]
Example #10
    def __add_annotation(self, df, col_text, current_index, annotations):
        """
        Function to add annotations in spaCy format to a dataframe.

        Parameters
        ----------
        df (pandas dataframe): Dataframe with text to be labelled.
        col_text (str): Column in pandas dataframe containing text to be labelled.
        current_index (int): Index of DataFrame row to annotate.
        annotations (dict): Dictionary containing strings of annotation patterns for PhraseMatcher.

        Returns
        -------
        None. The dataframe is updated in place with the annotations.
        """
        spans = []
        for label, items in annotations.items():
            if items:
                item_list = [
                    i.strip() for i in items.split(self.delimiter)
                    if i.strip() != ""
                ]
                matcher = PhraseMatcher(self.nlp.vocab, attr=self.attr)
                matcher.add(label, [self.nlp(item) for item in item_list])
                doc = self.nlp(df[col_text][current_index])
                matches = matcher(doc)
                spans_new = []
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label="")
                    spans_new.append(span)
                spans_filtered = spacy.util.filter_spans(spans_new)
                spans.extend([(span.start_char, span.end_char, label)
                              for span in spans_filtered])
            else:
                continue
        entities = {"entities": spans}
        df.at[current_index,
              "annotations"] = (df[col_text][current_index], entities)
Example #11
    def __call__(self, doc):
        """Call TargetMatcher on a doc. If `add_ents=True`, then matched
        spans will be merged in to doc.ents and `doc` will be returned.
        If `add_ents=False`, then matched spans will be returned as a list,
        in which case this cannot be used as part of a spaCy pipeline, which
        requires each component to return the doc, but can be used as a standalone matcher.

        In addition to extracting spans of text and setting labels, TargetRules
        can also define setting custom attributes and metadata. Additionally,
        each resulting span has an attribute span._.target_rule which maps
        a span to the TargetRule which set it.
        """
        matches = self.matcher(doc)
        spans = []
        for (rule_id, start, end) in matches:
            rule = self._rule_item_mapping[self.nlp.vocab.strings[rule_id]]
            span = Span(doc, start=start, end=end, label=rule.category)
            span._.target_rule = rule
            if rule.attributes is not None:
                for (attribute, value) in rule.attributes.items():
                    try:
                        setattr(span._, attribute, value)
                    except AttributeError as e:
                        raise e
            spans.append(span)
        if self.add_ents is True:
            for span in spans:
                try:
                    doc.ents += (span, )
                # spaCy will raise a ValueError if the tokens in the span are
                # already part of an entity (i.e., set by an upstream component).
                # In that case, let the existing span supersede this one.
                except ValueError as e:
                    # raise e
                    pass
            return doc
        else:
            return spans
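The try/except around doc.ents += (span,) guards against the ValueError spaCy raises when a span's tokens already belong to an existing entity. A short sketch of that behaviour (text and labels are illustrative):

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Acme Corp hired Jane Doe")
doc.ents = [Span(doc, 0, 2, label="ORG")]
try:
    doc.ents += (Span(doc, 1, 3, label="MISC"),)   # overlaps token 1
except ValueError as err:
    print("overlapping span rejected:", err)       # the existing entity is kept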
Example #12
    def __call__(self, doc: Doc) -> Doc:
        """Aggregates all weak supervision sources"""

        if len(doc.spans) > 0:

            # Extracting the observation data
            df = self.get_observation_df(doc)

            # Running the actual aggregation
            agg_df = self._aggregate(df)

            if "O" in self.out_labels:
                # Converting back to spans or token labels
                output_spans = utils.token_array_to_spans(
                    agg_df.values, self.out_labels)
                output_probs = utils.token_array_to_probs(
                    agg_df.values, self.out_labels)
            else:
                output_spans = agg_df.idxmax(axis=1).to_dict()
                output_probs = {
                    span: {
                        label: prob
                        for label, prob in distrib.items() if prob > 0.1
                    }
                    for span, distrib in agg_df.to_dict(
                        orient="index").items()
                }

            # Storing the results (both as spans and with the full probs)
            doc.spans[self.name] = [
                Span(doc, start, end, label=label)
                for (start, end), label in output_spans.items()
            ]
            doc.spans[self.name].attrs["probs"] = output_probs
            doc.spans[self.name].attrs["aggregated"] = True
            doc.spans[self.name].attrs["sources"] = list(df.columns)

        return doc
Example #13
def main():
    nlp = spacy.load('en_core_web_sm')

    # create lists for building a doc
    print('Create Doc #1: -------------------------')
    words = ['spaCy', 'is', 'useful', 'for', 'NLP', '!']
    spaces = [True, True, True, True, False, False]

    # create a doc from the words and spaces lists
    # and pass in the vocab
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    print(doc.text)
    print('\n')

    # create lists for building another doc
    print('Create Doc #2: -------------------------')
    words2 = ['Jazz', 'Winston', 'is', 'the', 'best', '!']
    spaces2 = [True, True, True, True, False, False]

    # create a doc from the words and spaces lists
    # and pass in the vocab
    doc2 = Doc(nlp.vocab, words=words2, spaces=spaces2)
    print(doc2.text)
    print('\n')

    # create a span for 'Jazz Winston' from the doc
    # and assign it the label 'DOG'
    span = Span(doc2, 0, 2, label='DOG')
    print(span.text, span.label_)

    # add the span to the doc's entities
    doc2.ents = [span]

    # print entities' text and labels
    print([(ent.text, ent.label_) for ent in doc2.ents])

    # end program
    print('\nDone.')
Example #14
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
Example #15
def test_matcher_sets_return_correct_tokens(matcher):
    patterns = [
        [{
            "LOWER": {
                "IN": ["zero"]
            }
        }],
        [{
            "LOWER": {
                "IN": ["one"]
            }
        }],
        [{
            "LOWER": {
                "IN": ["two"]
            }
        }],
    ]
    matcher.add("TEST", patterns)
    doc = Doc(matcher.vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ["zero", "one", "two"]
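For comparison, a self-contained version of the set-membership ("IN") pattern the test exercises, written with the list-style Matcher.add signature used by spaCy v3 (late v2 releases accept it as well):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("NUMBER_WORDS", [[{"LOWER": {"IN": ["zero", "one", "two"]}}]])

doc = nlp("Zero one TWO three")
print([doc[start:end].text for _, start, end in matcher(doc)])
# ['Zero', 'one', 'TWO']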
Example #16
    def merge_entities(self, e1, e2, doc, keep_first_label=True):
        """
        Take two neighbouring entities and merge them into one span (almost an entity).
        
        e1 : first entity (e.g. nlp(text).ents[0])
        e2 : second entity
        keep_first_label : whether to keep the first entity's label; otherwise the second entity's label is used
        """
        if keep_first_label:
            # Based on the French data, assume that the label of the union of two
            # entities separated by an apostrophe is the label of the first one;
            # this is why it is the default.
            new_label_ = e2.label_  # (unicode)
            # new_label = e2.label  # hash value of the entity label (int)
        else:
            new_label_ = e1.label_
            # new_label = e1.label

        # Create a Span using the token offsets (start/end), not the character
        # offsets of the entity in the document (start_char/end_char).
        start_token = e1.start
        end_token = e2.end
        new_entity = Span(doc, start_token, end_token, label=new_label_)
        return new_entity
Example #17
def tokensfromdoc(doc):
    d = nlp(doc)
    matches = matcher(d)
    for match_id, start, end in matches:
        term = Span(d, start, end, label='myterms')
        d.ents = list(d.ents) + [term]
    tokens = [
        w.lemma_ for w in d
        if w.pos_ != 'PRON'      # no pronouns
        and w.pos_ != 'PUNCT'    # no punctuation
        and w.ent_iob_ != 'B'    # not the beginning of a named entity
        and w.ent_iob_ != 'I'    # not inside a named entity
        and not w.is_stop        # not a stop word
    ]
    tokens += [de.string.rstrip() for de in d.ents]
    return tokens
Example #18
def crecord2spans(cr, nlp, ntr):
    """
    Get token offsets from character offsets and merge the entity annotations
    into the Doc alongside spaCy's own, resolving NER types with the ner_type_resolver (ntr)
    :param cr: RelationRecord
    :param nlp: spaCy model
    :param ntr: NERTypeResolver
    :return: Tuple[spacy.tokens.Span, spacy.tokens.Span]
    """
    doc = nlp(cr.context)
    spans = chars2spans(doc, cr.s_spanr, cr.o_spanr)
    uris = [cr.subject, cr.object]

    _v_ = nlp.vocab.strings
    true_ents = []
    for span, uri in zip(spans, uris):
        label_str = ntr.get_by_uri(uri, default_type=span.label_)  # fallback to spacy's ner tag
        if label_str not in _v_:
            log.info('crecord2spans: unknown entity type: "{}"'.format(label_str))
        label_id = _v_[label_str]  # modify StringStore if necessary
        true_ents.append(Span(doc=doc, start=span.start, end=span.end, label=label_id))
    corrected_ents = merge_ents_offsets(true_ents, doc.ents)
    doc.ents = corrected_ents
    return true_ents
Example #19
def bunsetu_span(token: Token) -> Span:
    bunsetu_bi_list = bunsetu_bi_labels(token.doc)
    start = token.i
    end = start + 1
    for idx in range(start, 0, -1):
        if bunsetu_bi_list[idx] == "B":
            start = idx
            break
    else:
        start = 0
    doc_len = len(token.doc)
    for idx in range(end, doc_len):
        if bunsetu_bi_list[idx] == "B":
            end = idx
            break
    else:
        end = doc_len

    doc = token.doc
    return Span(doc,
                start=start,
                end=end,
                label=POS_PHRASE_MAP.get(doc[start:end].root.pos_, ""))
Example #20
    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        :param doc: text to be analysed
        :return: text updated with the tags and the entities matched
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity and set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_metro_fault', True)
            # Overwrite doc.ents and add entity
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token.
            span.merge()
        return doc
Example #21
def Entity():
    print("\nThe outcomes of Entity Extraction are:")
    doc = nlp(u"京东CEO刘强东在美国明尼苏达涉嫌性侵女大学生。")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[doc.ents.index(ent)].ent_iob_,
            doc[doc.ents.index(ent)].ent_type_))

    from spacy.tokens import Span
    doc = nlp(u"奶茶妹妹遇见VP就有90%的几率1位出道……")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[doc.ents.index(ent)].ent_iob_,
            doc[doc.ents.index(ent)].ent_type_))
    augment = [Span(doc, 0, 1, label=doc.vocab.strings[u'WORK_OF_ART'])]
    doc.ents = list(doc.ents) + augment
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[doc.ents.index(ent)].ent_iob_,
            doc[doc.ents.index(ent)].ent_type_))
Example #22
    def __call__(self, doc, entities):
        spans = []
        matches = self.matcher(doc)
        for label, start, end in matches:
            span = doc[start:end]
            if span[0].ent_type == label:  # already labeled with this entity type, skip it
                continue

            # es only: try extending the match (compound)
            compound_expanded = False
            if (span.root.lang_ == 'es'):
                span = self._try_expand_compound(span, label)
                if span:
                    spans.append(span)
                    compound_expanded = True

            if not compound_expanded:
                span = Span(doc, start, end, label=label)
                spans.append(span)

        if (len(spans) > 0):
            doc.ents = list(doc.ents) + spans
        return doc
Example #23
def ingest_json_document(doc_json: Mapping, nlp: Language) -> Doc:
    doc = nlp(doc_json["text"])
    token_idx_list = [token.idx for token in doc]
    annotations = sorted(doc_json["labels"], key=lambda x: x[0])
    if len(annotations) == 0 and doc_json["annotation_approver"] is None:
        # raise ValueError
        pass
    first = True
    ents = []
    for annotation in annotations:
        character_start, character_end, label = annotation
        ent_start = bisect_right(token_idx_list, character_start) - 1
        ent_end = bisect_left(token_idx_list, character_end)
        if first:
            first = False
        else:
            ent_start = max(ent_start, ents[-1].end)
            # if ent_start >= ent_end:
            #     raise ValueError
        if ent_start < ent_end:
            ents.append(Span(doc, ent_start, ent_end, label))
    doc.ents = ents
    return doc
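As an aside, when character offsets happen to line up with token boundaries, Doc.char_span is a simpler way to build an entity span; it returns None for misaligned offsets, which is the case the bisect logic above works around. A small sketch with illustrative text:

import spacy

nlp = spacy.blank("en")
doc = nlp("Alan Turing was born in London")
span = doc.char_span(0, 11, label="PERSON")    # characters 0-11 cover "Alan Turing"
if span is not None:
    doc.ents = [span]
print([(ent.text, ent.label_) for ent in doc.ents])    # [('Alan Turing', 'PERSON')]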
Example #24
    def __call__(self, doc):
        doc_low = Doc(self.nlp.vocab,
                      words=[t.lower_ for t in doc],
                      spaces=[t.whitespace_ for t in doc])
        #print ("doc_low:", doc_low)
        # matches = self.matcher(doc_low)
        # print ("matches1:", matches)
        # # filter matches for overlaps (keep longest span)
        # negation_terms = self.filter_matches(matches)
        # print ("negation terms1:", negation_terms)

        #POS trigger
        matches = self.matcher1(doc_low)
        # filter matches for overlaps (keep longest span)
        negation_terms = self.filter_matches(matches)

        # PRE trigger
        matches2 = self.matcher2(doc_low)
        # filter matches for overlaps (keep longest span)
        negation_terms2 = self.filter_matches(matches2)

        negation_terms += negation_terms2

        doc._.negs = [
            Span(doc, start, end, label=rule_tag)
            for rule_tag, start, end in negation_terms
        ]

        #print (">>>", doc._.negs)

        for neg_span in doc._.negs:
            for token in neg_span:
                token._.negation_type_ = neg_span.label

        self.compute_negations(negation_terms, doc)

        return doc
Example #25
def evaluate(ner_model, examples):
    scorer = Scorer()
    stopper = 0

    for input, annot in examples:
        doc_gold_text = ner_model.make_doc(input)
        doc = ner_model(input)
        arrayOFEntities = [(ent.text, ent.label_) for ent in doc.ents]
        gold = GoldParse(doc_gold_text, entities=annot)
        print("entities is ", arrayOFEntities)
        print("sentence", input)
        print("annot is", annot)
        #for first,last,entity in annot:
        #print("word is",input[int(first):int(last)])
        #print("gold ner ",gold.ner)
        #modifiedGoldNer(input,arrayOFEntities,gold.ner)
        #print("gold ner after modified ",gold.ner)
        stopper += 1
        pred_value = ner_model(input)
        print(type(pred_value.ents))
        span = Span(pred_value, 20, 21, 0)

        print("span type", type(span), span.label_)

        for ent in pred_value.ents:
            print("pred value is", type(ent))
            print(ent.text)
            print(ent.text, "after modified")
            print(ent.label_)
            print(ent.start)
            print(ent.end)
            #print(pred_value[ent.start:ent.end])
        #pred_value.ents=span
        print("after modification", pred_value.ents)
        print("pred_value", pred_value)
        scorer.score(pred_value, gold)
    return scorer.scores
Example #26
def ingest_json_document(doc_json: Mapping,
                         nlp: Language,
                         include_other: bool,
                         is_predict=False) -> Doc:
    if is_predict:
        doc = nlp(doc_json["title"] + "\n" + doc_json["text"])
        doc.user_data["date"] = doc_json["date"]
        doc.user_data["url"] = doc_json["url"]
        doc.user_data["newspaper"] = doc_json["newspaper"]
        doc.user_data["categoria"] = doc_json["categoria"]
        doc.ents = []
        return doc
    else:
        if not doc_json["annotation_approver"] and not doc_json["labels"]:
            raise ValueError("Instance is not annotated!")
        else:
            doc = nlp(doc_json["text"])
            spans = list()
            #print(doc_json)
            for label in doc_json["labels"]:
                #print(doc_json["text"])
                if include_other or label[2] != "OTHER":
                    if doc_json["annotation_approver"] != "lazaro":
                        start_char = label[0]
                        end_char = label[1]
                        tag = label[2]
                        token_start = get_starting_token(start_char, doc)
                        token_end = get_ending_token(end_char, doc)
                    else:
                        token_start = label[0]
                        token_end = label[1]
                        tag = label[2]
                    if token_start is None or token_end is None:
                        raise ValueError("Token alignment impossible!")
                    spans.append(Span(doc, token_start, token_end, tag))
            doc.ents = spans
        return doc
Example #27
    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        self.ruler(doc)  # execute the ruler; it adds its entities to doc in place

        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in self.ruler.matcher(doc):
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # lei_code, country
            for token in entity:
                token._.set("is_lei", True)

        with doc.retokenize() as retokenizer:
            # Iterate over all spans and merge multi-token spans into a single token
            for span in spans:
                retokenizer.merge(span)

        return doc
Example #28
def build_docx(lines: List[str], tti: TaxonomyTokenIdentify):
    # nlp = tti.nlp
    matcher = tti.get_phrase_matcher()

    docx = tti.spacify_text('\n'.join(lines), False)  # No fuzzy matching

    matches = matcher(docx)
    match_spans = []

    for match_id, start, end in matches:
        try:
            # rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
            # span = docx[start: end]  # get the matched slice of the doc
            #     print(rule_id, span.text)

            # create a new Span for each match and use the match_id (ANIMAL) as the label
            span = Span(docx, start, end, label=match_id)
            match_spans.append(span)
        except Exception as ee:
            print(match_id, docx[start:end], ee)

    docx.ents = set(list(docx.ents)) | set(filter_spans(match_spans))

    return docx
Example #29
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    ner.add_label("SOME_LABEL")
    nlp.initialize()
    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
    assert len(ner.labels) == 2
    assert "MY_ORG" in ner.labels
Example #30
 def __call__(self, doc: Doc) -> Doc:
     """
     Annotate the document with noun phrase spans
     """
     spans = []
     doc_vecs = []
     doc_chars = []
     doc_lens = []
     if len(doc) < 1:
         return doc
     for sentence in doc.sents:
         features = self._feature_extractor([t.text for t in sentence])
         if isinstance(features, tuple):
             doc_vec = features[0]
             doc_chars.append(features[1])
         else:
             doc_vec = features
         doc_vecs.append(doc_vec)
         doc_lens.append(len(doc_vec))
     doc_vectors = pad_sentences(np.asarray(doc_vecs))
     inputs = doc_vectors
     if self.char_vocab:
         max_len = doc_vectors.shape[1]
         padded_chars = np.zeros(
             (len(doc_chars), max_len, self.model.max_word_len))
         for idx, d in enumerate(doc_chars):
             d = d[:max_len]
             padded_chars[idx, -d.shape[0]:] = d
         inputs = [inputs, padded_chars]
     np_indexes = self._infer_chunks(inputs, doc_lens)
     for s, e in np_indexes:
         np_span = Span(doc, s, e)
         spans.append(np_span)
     spans = _NPPostprocessor.process(spans)
     set_noun_phrases(doc, spans)
     return doc
Example #31
def test_matcher_sets_return_correct_tokens(en_vocab):
    matcher = Matcher(en_vocab)
    patterns = [
        [{
            'LOWER': {
                'IN': ["zero"]
            }
        }],
        [{
            'LOWER': {
                'IN': ["one"]
            }
        }],
        [{
            'LOWER': {
                'IN': ["two"]
            }
        }],
    ]
    matcher.add('TEST', None, *patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ['zero', 'one', 'two']
Example #32
def test_get_spans(nlp_small):

    doc = nlp_small(
        "This is just a small test for checking that the method works correctly"
    )
    doc.spans["source1"] = [
        Span(doc, 0, 2, label="LABEL1"),
        Span(doc, 4, 5, label="LABEL2")
    ]
    doc.spans["source2"] = [
        Span(doc, 0, 1, label="LABEL3"),
        Span(doc, 2, 6, label="LABEL2")
    ]
    doc.spans["source4"] = [Span(doc, 0, 2, label="LABEL2")]
    doc.spans["source3"] = [
        Span(doc, 7, 9, label="LABEL2"),
        Span(doc, 1, 4, label="LABEL1")
    ]

    assert set(
        (span.start, span.end)
        for span in utils.get_spans(doc, ["source1", "source2"])) == {(0, 2),
                                                                      (2, 6)}
    assert set(
        (span.start, span.end)
        for span in utils.get_spans(doc, ["source1", "source3"])) == {(1, 4),
                                                                      (4, 5),
                                                                      (7, 9)}
    assert {(span.start, span.end): span.label_
            for span in utils.get_spans(doc, ["source1", "source4"])} == {
                (0, 2): "LABEL2",
                (4, 5): "LABEL2"
            }
    assert set(
        (span.start, span.end)
        for span in utils.get_spans(doc, ["source2", "source3"])) == {(0, 1),
                                                                      (2, 6),
                                                                      (7, 9)}
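A minimal sketch of the doc.spans container the test relies on (spaCy v3+): named span groups that, unlike doc.ents, may hold overlapping spans. The group name, labels and text are illustrative.

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("This is just a small test")
doc.spans["source1"] = [Span(doc, 0, 2, label="LABEL1"),
                        Span(doc, 1, 4, label="LABEL2")]   # overlap is allowed
print([(s.text, s.label_) for s in doc.spans["source1"]])
# [('This is', 'LABEL1'), ('is just a', 'LABEL2')]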
Example #33
def test_span_kb_id_readonly(doc):
    span = Span(doc, 0, 1)
    with pytest.raises(NotImplementedError):
        span.kb_id_ = "Q342"
Example #34
def test_span_label_readonly(doc):
    span = Span(doc, 0, 1)
    with pytest.raises(NotImplementedError):
        span.label_ = "hello"