Example #1
def write_visualization(names: list, fpath: Path, out_path: Path, taxonomy, tti):
    # Now look for named entities
    nlp = English()
    docx = nlp('\n'.join(names))

    matcher, nlp = build_phrase_matcher(nlp, taxonomy)
    matches = matcher(docx)
    match_spans = []
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # string ID of the matched rule, e.g. 'COLOR'
        # print(rule_id, docx[start:end].text)

        # create a new Span for each match and use the match_id as the label
        span = Span(docx, start, end, label=match_id)
        match_spans.append(span)

    docx.ents = list(docx.ents) + filter_spans(match_spans)

    html = create_visualization2(docx, False)
    # print(len(html))
    # fname = f'{datetime.now().strftime("%m%d%y_%H%M%S")}.html'

    abbrev = circle_abbrev_from_path(fpath)
    out_path = out_path / f'{abbrev}-{fpath.suffix[1:]}-spacy.html'
    # print(out_path)

    tti.save_visualization(out_path, html)
Example #2
def create_a_labeled_jsonl_dataset(pattern_file_path, jsonl_datafile,
                                   annotated_jsonl_datafile):
    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)
    with open(pattern_file_path, "r") as fr:
        for idx, line in enumerate(fr):
            pattern_json = srsly.json_loads(line)
            pattern = pattern_json["pattern"]
            label = pattern_json["label"]
            # pattern_name = "_".join([x["lower"] for x in pattern])
            matcher.add(label, [pattern])

    with open(annotated_jsonl_datafile, "w") as fw:
        with open(jsonl_datafile, "r") as fr:
            for idx, line in enumerate(fr):
                line_json = srsly.json_loads(line)
                line_nlp = nlp(line_json["text"])
                matches = matcher(line_nlp)
                spans = []
                for match_id, start, end in matches:
                    span = Span(line_nlp, start, end, label=match_id)
                    spans.append(span)
                spans = filter_spans(spans)
                if spans:
                    print(f"{idx}, spans({len(spans)}):{spans}")
                spans_dicts_list = _spans_to_spans_dicts_list(spans)

                line_json["spans"] = spans_dicts_list
                fw.write(json.dumps(line_json) + "\n")
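The pattern file and data file formats are only implied by the snippet above; the following is a minimal, hypothetical sketch of what the function expects: one Matcher pattern object per line in the pattern file and plain {"text": ...} records in the data file. The labels, patterns, and file names are made up for illustration, and the final call is commented out because it also needs the undefined helper _spans_to_spans_dicts_list and an installed en_core_web_sm model.

# Hypothetical file contents illustrating the assumed JSONL formats.
import srsly

pattern_lines = [
    {"label": "FRUIT", "pattern": [{"LOWER": "red"}, {"LOWER": "apple"}]},
    {"label": "VEHICLE", "pattern": [{"LOWER": "pickup"}, {"LOWER": "truck"}]},
]
data_lines = [{"text": "I bought a red apple."}]

srsly.write_jsonl("patterns.jsonl", pattern_lines)
srsly.write_jsonl("data.jsonl", data_lines)
# create_a_labeled_jsonl_dataset("patterns.jsonl", "data.jsonl", "annotated.jsonl")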
Example #3
 def _proc(self, doc: Doc, pattern: Union[Pattern, str], label: str) -> Doc:
     spans = self.get_spans(doc, pattern, label or self._DEFAULT_LABEL)
     doc.ents = filter_spans(tuple(spans) + doc.ents)  # type: ignore
     # TODO: https://github.com/python/mypy/issues/3004
     if self.merge:
         merge_spans(doc, spans)
     return doc
Example #4
def phrasesExtraction(sentence):
    '''
    Extract the noun and verb phrases from the given sentence and
    replace them with 'NP'/'VP' placeholders in the returned string.
    '''
    # instantiate a Matcher instance (relies on module-level `nlp` and `pattern`)
    matcher = Matcher(nlp.vocab)
    # NOTE: spaCy v2 signature; in v3 use matcher.add("Verb phrase", [pattern])
    matcher.add("Verb phrase", None, pattern)

    doc = nlp(sentence)
    # call the matcher to find matches
    matches = matcher(doc)
    spans = [doc[start:end] for _, start, end in matches]

    # print('NP: ',list(doc.noun_chunks))
    # print ('VP: ',filter_spans(spans))

    for np in list(doc.noun_chunks):
        sentence = sentence.replace(str(np), 'NP')
    for vp in filter_spans(spans):
        sentence = sentence.replace(str(vp), 'VP')
    sentence = sentence.replace('(NP)', 'NP')
    sentence = sentence.replace('(VP)', 'VP')
    # print (sentence)
    return sentence
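The snippet relies on a module-level nlp and pattern that are not shown. Below is a minimal sketch of plausible definitions; the verb-phrase pattern is illustrative, not the original one, and the call is left commented because the snippet uses the spaCy v2 Matcher.add signature, which would fail under v3.

# Hypothetical module-level objects assumed by phrasesExtraction.
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")
pattern = [
    {"POS": "AUX", "OP": "*"},   # optional auxiliaries, e.g. "is", "has been"
    {"POS": "ADV", "OP": "*"},   # optional adverbs
    {"POS": "VERB", "OP": "+"},  # one or more main verbs
]

# print(phrasesExtraction("The quick brown fox is jumping over the lazy dog."))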
Example #5
def chunk_selection(doc: Doc) -> Iterable[Candidate]:
    """Get keywords candidates from noun chunks and entities.

    Args:
        doc (Doc): doc.

    Returns:
        Iterable[Candidate]
    """
    surface_forms = []
    spans = list(doc.ents)
    ent_words: Set[str] = set()
    sentence_indices = []
    for span in spans:
        ent_words.update(token.i for token in span)
    for np in doc.noun_chunks:
        # https://github.com/explosion/sense2vec/blob/c22078c4e6c13038ab1c7718849ff97aa54fb9d8/sense2vec/util.py#L105
        while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
            np = np[1:]
        if not any(w.i in ent_words for w in np):
            spans.append(np)
    for sent in doc.sents:
        sentence_indices.append((sent.start, sent.end))
    for span in filter_spans(spans):
        for i, token_indices in enumerate(sentence_indices):
            if span.start >= token_indices[0] and span.end <= token_indices[1]:
                surface_forms.append((i, span))
                break
    return _merge_surface_forms(surface_forms)
Example #6
 def __call__(self, doc: Doc) -> Doc:
     for sent in doc.sents:
         blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
         mlist = blist.mrph_list()
         tlist = blist.tag_list()
         for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
             sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
         if len(mlist) != len(sent):
             t, m = None, None
             for t, m in zip(sent, mlist):
                 if t.text != m.midasi:
                     break
             raise ValueError(
                 f"""Internal error occured
         Sentence: {sent.text}
         mlist : {[m.midasi for m in mlist]}
         tokens: {[t.text for t in sent]}
         diff  : {m.midasi}, {t.text}
         """
             )
         for m, token in zip(mlist, sent):
             token._.set(KNP_USER_KEYS.morph.element, m)
     doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
     doc.noun_chunks_iterator = knp_noun_chunker  # type: ignore
     # TODO: https://github.com/python/mypy/issues/3004
     return doc
Example #7
    def match_nounphrases(self, doc_):
        '''
        Get noun phrases based on POS patterns
            Arguments:
                doc_ - spaCy Doc to extract noun phrases from
            Returns:
                list of deduplicated noun-phrase strings
        '''
        # find matches, remove overlaps
        matches = self.matcher(doc_)
        if not matches:
            return matches
        spans = [doc_[start:end] for _, start, end in matches]
        spans = filter_spans(spans)  # remove overlaps
        spans = [span.text.strip() for span in spans]  # keep text only
        spans = [s[1:] if s.startswith('-') else s
                 for s in spans]  # sometimes, '-' is first char

        short_spans = []  # split long spans (5+ words)
        for span in spans:
            if len(span.split()) >= 5:
                short_spans.extend(self.split_long_nps(span))
            else:
                short_spans.append(span)

        stack = [short_spans[0]]  # remove duplicates, but keep order
        for span in short_spans[1:]:
            if span not in stack:
                stack.append(span)
        stack = [w for w in stack if w not in self.single_stopwords]

        return stack
Example #8
    def moodys_merge_noun_chunks(doc):
        """
        Merge noun chunks into a single token.

        Modified from sources of:
        - https://github.com/cemoody/lda2vec/blob/master/lda2vec/preprocess.py
        - https://spacy.io/api/pipeline-functions#merge_noun_chunks

        :params doc: Doc object.
        :returns: Doc object with merged noun chunks.
        """
        bad_deps = ('amod', 'compound')

        if not doc.is_parsed:  # spaCy v2 attribute; in v3 use doc.has_annotation("DEP")
            return doc
        with doc.retokenize() as retokenizer:
            for np in filter_spans(list(doc.noun_chunks)):

                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(np) > 1 and np[0].dep_ not in bad_deps:
                    np = np[1:]

                if len(np) > 1:
                    # Merge NPs
                    attrs = {"tag": np.root.tag, "dep": np.root.dep}
                    retokenizer.merge(np, attrs=attrs)
        return doc
Example #9
def match_spans(parsed_verses, matcher):
    """For every verse, apply the custom matcher rules, keep the
    relevant spans that match the tense rules, and map them to
    verse-reference tuples. The identified spans can later be
    matched with words from the verb dictionaries.
    """
    verse2spans = collections.defaultdict(dict)
    for trans, ref_tuples in parsed_verses.items():
        for ref_tuple, spacy_doc in ref_tuples.items():
            matches = matcher(spacy_doc)
            
            # retrieve Spacy Span objects
            # and give them tense tags
            spans = []
            for m_id, start, end in matches:
                span = spacy_doc[start:end]
                span._.tense_tag = nlp.vocab.strings[m_id]
                correct_span(span)
                spans.append(span)
            
            filtered_spans = filter_spans(spans)  # filter out overlapping spans; keep longest
            attach_span(filtered_spans) # ensure tokens are mapped to their matched span
            for span in filtered_spans:
                bequeath_tense(span) 
            
            # save positive matches; unmatched verses will
            # be recognized later
            if filtered_spans:
                verse2spans[trans][ref_tuple] = filtered_spans

    return verse2spans
Example #10
def test_filter_spans(doc):
    # Test filtering duplicates
    spans = [doc[1:4], doc[6:8], doc[1:4], doc[10:14]]
    filtered = filter_spans(spans)
    assert len(filtered) == 3
    assert filtered[0].start == 1 and filtered[0].end == 4
    assert filtered[1].start == 6 and filtered[1].end == 8
    assert filtered[2].start == 10 and filtered[2].end == 14
    # Test filtering overlaps with longest preference
    spans = [doc[1:4], doc[1:3], doc[5:10], doc[7:9], doc[1:4]]
    filtered = filter_spans(spans)
    assert len(filtered) == 2
    assert len(filtered[0]) == 3
    assert len(filtered[1]) == 5
    assert filtered[0].start == 1 and filtered[0].end == 4
    assert filtered[1].start == 5 and filtered[1].end == 10
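Outside of a test fixture, the same behaviour can be observed directly. A small standalone sketch (a blank pipeline is enough, since only tokenization is needed):

# Standalone illustration of the overlap filtering the test above asserts:
# filter_spans drops duplicates and keeps the longest span in each overlap.
import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("zero one two three four five six seven eight nine ten")
spans = [doc[1:4], doc[1:3], doc[5:10], doc[7:9], doc[1:4]]
print(filter_spans(spans))
# expected: [one two three, five six seven eight nine]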
Example #11
def get_matches_in_proper_format(text, label_and_terms, nlp):
    """
    Match the terms of an entity/label in a text and return them in the 
    format for the NER model. We use PhraseMatcher to find words or phrases
    in texts based on patterns.
    """

    matched_spans = [] 
    for label, terms in label_and_terms.items():
        # Initialize the PhraseMatcher with the vocabulary 
        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        terms = set(terms)
        # Add the pattern to the matcher
        patterns = [nlp.make_doc(text) for text in terms]
        matcher.add("TerminologyList", patterns)
        # Get the Doc from the text
        doc = nlp(text)
        # Find all sequences matching the supplied patterns on the Doc
        matches = matcher(doc)
        # Get the spans matched
        for match_id, start, end in matches:
            span = doc[start:end]
            # Update the custom attribute of the span to use it later
            span._.label = label
            matched_spans.append(span)
            
    # Remove overlaps. The (first) longest span is preferred over shorter spans
    matched_spans_filtered = filter_spans(matched_spans)
    entities = []
    for span in matched_spans_filtered:
        # Get the info of the match needed for the format of data
        match_info_in_text = (span.start_char, span.end_char, span._.label)
        entities.append(match_info_in_text)
    return entities
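This snippet reads and writes a custom span._.label attribute, which must be registered before the function runs. A minimal sketch of the setup it assumes (the extension name comes from the code; the default value, input dict, and expected output are guesses):

# Register the custom span extension used by get_matches_in_proper_format.
import spacy
from spacy.tokens import Span

if not Span.has_extension("label"):
    Span.set_extension("label", default=None)

nlp = spacy.load("en_core_web_sm")
label_and_terms = {"FRUIT": ["red apple", "banana"]}  # hypothetical input
print(get_matches_in_proper_format("I bought a red apple.", label_and_terms, nlp))
# expected something like: [(11, 20, 'FRUIT')]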
Example #12
def test_filter_spans(doc):
    # Test filtering duplicates
    spans = [doc[1:4], doc[6:8], doc[1:4], doc[10:14]]
    filtered = filter_spans(spans)
    assert len(filtered) == 3
    assert filtered[0].start == 1 and filtered[0].end == 4
    assert filtered[1].start == 6 and filtered[1].end == 8
    assert filtered[2].start == 10 and filtered[2].end == 14
    # Test filtering overlaps with longest preference
    spans = [doc[1:4], doc[1:3], doc[5:10], doc[7:9], doc[1:4]]
    filtered = filter_spans(spans)
    assert len(filtered) == 2
    assert len(filtered[0]) == 3
    assert len(filtered[1]) == 5
    assert filtered[0].start == 1 and filtered[0].end == 4
    assert filtered[1].start == 5 and filtered[1].end == 10
Example #13
def main(model: str, origin_jsonl_path: str, label_by_model_jsonl_path: str):
    nlp = spacy.load(model)
    origin_jsonl_path = Path(origin_jsonl_path)
    label_by_model_jsonl_path = Path(label_by_model_jsonl_path)
    print(
        f"annotate with {model}:\n{origin_jsonl_path}->{label_by_model_jsonl_path}"
    )
    with open(label_by_model_jsonl_path, "w") as fw:
        with open(origin_jsonl_path, "r") as fr:
            count_lines = -1  # stays -1 if the input file is empty
            for count_lines, _ in enumerate(tqdm(fr)):
                pass
        with open(origin_jsonl_path, "r") as fr:
            for idx, line in enumerate(tqdm(fr, total=count_lines + 1)):
                line_json = srsly.json_loads(line)
                line_json["spans"] = list(
                )  # delete any existing spans (labels)
                line_nlp = nlp(line_json["text"])
                spans = []
                for ent in line_nlp.ents:
                    span = Span(line_nlp, ent.start, ent.end, label=ent.label_)
                    spans.append(span)
                spans = filter_spans(spans)  # defensive; the NER model should not output overlapping spans
                # if spans:
                #     print(f"{idx}, spans({len(spans)}):{spans}")
                spans_dicts_list = _spans_to_spans_dicts_list(spans)
                line_json["spans"] = spans_dicts_list
                fw.write(json.dumps(line_json) + "\n")
Example #14
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    texts = [line.rstrip() for line in open(in_file, 'r')]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # print(doc)
            spans = get_phrases(doc, wn_lemmas)
            spans = filter_spans(spans)
            # print('NOUN SPAN', str(spans))
            doc = merge_phrases(doc, spans)
            spans = get_adjective_phrases(doc)
            spans = filter_spans(spans)
            # print('ADJ SPAN', str(spans))
            # print('*-----------------------------------------*')
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #15
 def _merge_doc(doc):
     from spacy.util import filter_spans
     spans = list(doc.ents) + list(doc.noun_chunks)
     spans = filter_spans(spans)
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
     return doc
Example #16
 def __call__(self, doc: Doc) -> Doc:
     ents = [
         Span(doc, start=s, end=e, label="CUSTOM")
         for _, s, e in self.matcher(doc)
     ]
     ents = filter_spans(ents)
     doc.set_ents(ents)
     return doc
Example #17
def filter_matches(text_spacy, matches):
    spans_orig = [text_spacy[start:end] for _, start, end in matches]
    spans_filtered = filter_spans(spans_orig)
    match_filter_indx = [
        index for index, item in enumerate(spans_orig)
        if item in spans_filtered
    ]
    matches_filtered = [matches[indx] for indx in match_filter_indx]
    return matches_filtered
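In spaCy v3 a similar result can be obtained more directly, since the matcher can return spans itself. A hedged alternative sketch (the function name is hypothetical, and note that it returns Span objects rather than (match_id, start, end) tuples):

# Equivalent idea using Matcher's as_spans option in spaCy v3;
# filter_spans can be applied to the returned spans directly.
from spacy.util import filter_spans

def filter_matches_v3(text_spacy, matcher):
    spans = matcher(text_spacy, as_spans=True)
    return filter_spans(spans)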
Example #18
def extract_people(doc):
    # Merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = util.filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    # Get the people (after merging, each PERSON entity is a single token)
    people = [str(person) for person in filter(lambda w: w.ent_type_ == "PERSON", doc)]
    return people
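A minimal usage sketch, assuming a standard English pipeline; the example sentence and expected output are illustrative:

import spacy
from spacy import util

nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama met Angela Merkel in Berlin.")
print(extract_people(doc))
# expected something like: ['Barack Obama', 'Angela Merkel']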
Example #19
def to_format(text, data, label):
  """
  Format `data` into the standard spaCy training format.
  """
  res = []
  for word in data:
    position = find_position(text, word)
    for pos_start, pos_end in position:
      if pos_start + pos_end != 0:
        # NOTE: this appears to be a project-specific Span wrapper (built from a
        # list and exposing get_tuple()), not spacy.tokens.Span
        res.append(Span([pos_start, pos_end, label]))

  return [i.get_tuple() for i in filter_spans(res)]
Example #20
def test_issue6207(en_tokenizer):
    doc = en_tokenizer("zero one two three four five six")

    # Make spans
    s1 = doc[:4]
    s2 = doc[3:6]  # overlaps with s1
    s3 = doc[5:7]  # overlaps with s2, not s1

    result = util.filter_spans((s1, s2, s3))
    assert s1 in result
    assert s2 not in result
    assert s3 in result
Example #21
    def featurize(self,
                  text: str,
                  limit: Union[bool, int] = False) -> DefaultDict[Any, Any]:

        doc = self.nlp(text)

        # Extract tokens containing...
        # (1) Abbreviations
        abbrev_tokens = []
        for abrv in doc._.abbreviations:
            abbrev_tokens.append(str(abrv._.long_form).split())
        abbrev_tokens_flattened = [t for at in abbrev_tokens for t in at]

        # (2) Entities
        entities = [str(e) for e in doc.ents]
        entity_tokens = [e.split() for e in entities]
        entity_tokens_flattened = [t for et in entity_tokens for t in et]

        # (3) Noun phrases
        np_tokens = []
        for chunk in doc.noun_chunks:
            np_tokens.append(str(chunk.text).split())
        np_tokens_flattened = [t for et in np_tokens for t in et]

        # (4) Verb phrases
        verb_matches = self.verb_matcher(doc)
        spans = [doc[start:end] for _, start, end in verb_matches]
        vp_tokens = filter_spans(spans)
        vp_tokens_flattened = [str(t) for et in vp_tokens for t in et]

        # Optionally truncate the doc to the first `limit` tokens.
        if limit:
            doc = doc[:limit]

        # Aggregate all features together.
        features: DefaultDict[str, List[Union[int, str]]] = defaultdict(list)
        for token in doc:
            features["tokens"].append(str(token.text))
            features["pos"].append(str(token.tag_))  # previously token.pos_
            features["head"].append(str(token.head))
            # (Note: the following features are binary lists indicating the presence of a
            # feature or not per token, like "[1 0 0 1 1 1 0 0 ...]")
            features["entities"].append(1 if token.text in
                                        entity_tokens_flattened else 0)
            features["np"].append(1 if token.text in
                                  np_tokens_flattened else 0)
            features["vp"].append(1 if token.text in
                                  vp_tokens_flattened else 0)
            features["abbreviation"].append(1 if token.text in
                                            abbrev_tokens_flattened else 0)

        return features
Example #22
def merge_phrases(doc: Doc) -> Doc:
    """Transform a spaCy Doc to match the sense2vec format: merge entities
    into one token and merge noun chunks without determiners.

    doc (Doc): The document to merge phrases in.
    RETURNS (Doc): The Doc with merged tokens.
    """
    spans = get_phrases(doc)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc
Example #23
    def __call__(self, doc: Doc) -> Doc:
        """
        Slightly modified from spacy.pipeline.function.merge_entities to accommodate
        stopword trimming.
        """
        with doc.retokenize() as retokenizer:
            # Merge discovered entities / noun chunks.
            # Ones found via `PipedPhraseMatcher` have label "CUSTOM"
            ents = [
                ent for ent in doc.ents if self.filter_entities is None
                or ent.label_ in self.filter_entities
            ]
            custom = set(tok.i for ent in ents for tok in ent
                         if ent.label_ == "CUSTOM")

            noun_chunks = []
            if doc.has_annotation("DEP"):
                # ensure precedence of CUSTOM phrases
                noun_chunks = [
                    noun for noun in doc.noun_chunks
                    if not any(tok.i in custom for tok in noun)
                ]

            # eliminate overlapping spans, keeping the longest
            # NB that, given earlier filtering, CUSTOM phrases should never be subsumed/
            # broken up
            phrases = filter_spans([
                p for p in ents + noun_chunks
                if p.label_ == "CUSTOM" or len(p) <= self.max_phrase_len
            ])

            for phrase in phrases:
                attrs = {
                    "tag": phrase.root.tag,
                    "dep": phrase.root.dep,
                    "ent_type": phrase.label,
                }
                # need to trim leading/trailing stopwords
                if phrase.label_ != "CUSTOM" and self.stopwords is not None:
                    while phrase and phrase[0].lower_ in self.stopwords:
                        phrase = phrase[1:]
                    while phrase and phrase[-1].lower_ in self.stopwords:
                        phrase = phrase[:-1]

                if not phrase:
                    continue

                retokenizer.merge(phrase, attrs=attrs)

        return doc
Example #24
 def __call__(self, doc: Doc) -> Doc:
     for sent in doc.sents:
         blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
         mlist = blist.mrph_list()
         tlist = blist.tag_list()
         if len(mlist) != len(sent):
             mlist = _separate_mrph(mlist, sent)
         for label, comp in zip([blist, mlist, tlist],
                                ["bunsetsu", "morph", "tag"]):
             sent._.set(getattr(KNP_USER_KEYS, comp).list_, label)
         for m, token in zip(mlist, sent):
             token._.set(KNP_USER_KEYS.morph.element, m)
     doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))
     doc.noun_chunks_iterator = knp_noun_chunker
     return doc
Example #25
 def __call__(self, doc: Doc) -> Doc:
     for sent in doc.sents:
         blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
         mlist = blist.mrph_list()
         tlist = blist.tag_list()
         for l, comp in zip([blist, mlist, tlist],
                            ["bunsetsu", "morph", "tag"]):
             sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
         assert len(mlist) == len(sent)
         for m, token in zip(mlist, sent):
             token._.set(KNP_USER_KEYS.morph.element, m)
     doc.ents = filter_spans(doc.ents +
                             tuple(_extract_knp_ent(doc)))  # type: ignore
     # TODO: https://github.com/python/mypy/issues/3004
     return doc
Example #26
 def __call__(self, doc: Doc) -> Doc:
     matches = self.get_char_spans(doc.text)
     spans = []
     for i, j, text in matches:
         span = get_doc_char_span(doc,
                                  i,
                                  j,
                                  destructive=self.destructive,
                                  label=self.get_label(text))
         if span:
             spans.append(span)
      [s.text for s in spans]  # TODO: resolve the evaluation bug and remove this line
     ents = filter_spans(doc.ents + tuple(spans))
     doc.ents = tuple(ents)
     return doc
Example #27
    def __call__(self, doc: Doc) -> Doc:
        '''
        Find all temporal connectives in the document and store their span
        indices in the custom `temporal_connectives_span_indices` attribute.

        Parameters:
        doc(Doc): A spaCy document.
        '''
        matches = self._matcher(doc)
        temporal_connectives_spans = [doc[start:end] for _, start, end in matches]

        # Save the temporal connectives found
        doc._.temporal_connectives_span_indices = [
            {'start': span.start, 'end': span.end, 'label': span.label}
            for span in filter_spans(temporal_connectives_spans)
        ]
        
        return doc
Example #28
    def to_spacy(self, df, file_path=None):
        """
        Function to convert dataframe returned by annotator into spacy .

        Parameters
        ----------
        df (pandas DataFrame): Dataframe returned by the annotator (see Annotate()).
        file_path (str): Filepath (including filename) to save the .spacy file to.
        
        Returns
        -------
        Spacy docbin if a user wants to combine additional training data
        """

        if (not isinstance(df, pd.DataFrame)):
            raise TypeError("Pass the pandas dataframe returned by annotate()")

        if file_path and (not isinstance(file_path, str)):
            raise TypeError("The file_path must be a string or None")

        if file_path is None:
            file_path = os.path.join(os.getcwd(), 'annotations.spacy')

        db = DocBin()
        training_data = [ant for ant in df['annotations'].tolist() if ant]
        for text, annotations in training_data:

            ents = []
            doc = self.nlp(text)
            for start, end, label in annotations['entities']:

                span = doc.char_span(start, end, label=label)
                # char_span returns None when the offsets don't align with token
                # boundaries; skip those to avoid errors in filter_spans below
                if span is not None:
                    ents.append(span)

            # Drop overlapping spans. Note: when spans overlap, the (first) longest span is preferred over shorter spans.
            # See: https://spacy.io/api/top-level#util.filter_spans
            # TODO: alert users that some spans have been dropped.
            doc.ents = filter_spans(ents)

            db.add(doc)

        db.to_disk(file_path)
        print(f"Spacy file saved to: {file_path}")

        return db
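The expected shape of df is only implied by the code. A hedged sketch of an input the method would accept (the column name comes from the code; the row and the commented call are illustrative):

# Illustrative dataframe: one 'annotations' entry per row, each a
# (text, {"entities": [(start_char, end_char, label), ...]}) pair.
import pandas as pd

df = pd.DataFrame({
    "annotations": [
        ("Apple was founded by Steve Jobs.",
         {"entities": [(0, 5, "ORG"), (21, 31, "PERSON")]}),
    ]
})
# db = annotator.to_spacy(df, "annotations.spacy")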
Example #29
    def __call__(self, doc: Doc) -> Doc:
        """Apply the pipeline component to a `Doc` object.

        doc (Doc): The `Doc` returned by the previous pipeline component.
        RETURNS (Doc): The modified `Doc` object.
        """
        spans = self.matcher(doc, as_spans=True)
        for span in spans:
            for token in span:
                token._.set(self._is_emoji, True)

        if self.merge_spans:
            spans = filter_spans(spans)
            with doc.retokenize() as retokenizer:
                for span in spans:
                    if len(span) > 1:
                        retokenizer.merge(span)
        return doc
Example #30
        def run_spacytagger(string):
            """
            Runs spacy on `string` and returns a list of
            :class:`quepy.tagger.Word` objects.
            """
            assert_valid_encoding(string)

            # For now, at least, perform our own pre-processing
            # --to ensure terms like "presynaptic" are easily found later.
            string = ' '.join(string.split())
            string = collapse(string)

            doc = nlp(string)  # NOTE: spaCy expects and returns unicode

            spans = [doc[start:end] for match_id, start, end in matcher(doc)]
            filtered = filter_spans(spans)
            with doc.retokenize() as retokenizer:
                for span in filtered:
                    retokenizer.merge(span)
            # tag_ is the "fine-grained" POS
            words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

            # The following is only for logging purposes; if necessary, it could be removed for production
            logger.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))
            indent = "  "
            longest = max(len(t.text) for t in doc)
            column = (len(doc) - 1) * len(indent) + longest + 2
            wout = '{:' + str(column) + '}| '

            def trav_tree(indents, node):
                logger.info(wout.format((indent * indents) + node.text) + ', '.join(
                    [str(x) for x in [
                        node.i, node.is_oov, node.lemma_, node.tag_,
                        "<-" + str(node.left_edge), str(node.right_edge) + "->"]]))
                for el in node.children:
                    # NOTE: Could also change display based on node.lefts and node.rights
                    trav_tree(indents + 1, el)

            for sent in doc.sents:
                trav_tree(0, sent.root)
            logger.info('Ents:  ' + str(doc.ents))
            logger.info('NPs:   ' + str(list(doc.noun_chunks)))

            return words
Example #31
def get_matched_pos_chunks(doc, pattern):
    """
    Get the list of chunks from the document that match the pattern

    Overlapping spans will be filtered and the longest ones will be
    returned on the list, for example, the text span

        a mosquitocidal Bacillus thuringiensis

    with PoS Tags

        DT JJ JJ NN

    will yield the following matches:

        DT JJ JJ NN - a mosquitocidal Bacillus thuringiensis
           JJ JJ NN -   mosquitocidal Bacillus thuringiensis
              JJ NN -                 Bacillus thuringiensis
                 NN -                          thuringiensis

    This function will ignore the shorter overlaps and return the
    longer. Not ideal but it gets the job done on this specific
    case.

    Arguments:
        doc: Document
            An annotated spacy document
        pattern: Dict
            A matcher pattern dictionary

    Returns: List[Span]
        The list of matching chunks
    """
    matcher = Matcher(nlp.vocab)  # relies on a module-level `nlp`
    # NOTE: spaCy v2 signature; in v3 use matcher.add("CHUNKS", [pattern])
    matcher.add("CHUNKS", None, pattern)
    matches = matcher(doc)
    chunk_spans = list()

    for match_id, start, end in matches:
        span = doc[start:end]
        chunk_spans.append(span)

    longest_spans = filter_spans(chunk_spans)
    return longest_spans
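A minimal sketch of how this helper might be called; the pattern, the example text, and the module-level nlp are illustrative, not from the original source, and the call is left commented because the snippet uses the spaCy v2 Matcher.add signature.

# Hypothetical call: match determiner/adjective/noun chunks, keeping only the longest.
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")
pattern = [
    {"TAG": "DT", "OP": "?"},
    {"TAG": "JJ", "OP": "*"},
    {"TAG": {"IN": ["NN", "NNS", "NNP", "NNPS"]}, "OP": "+"},
]
doc = nlp("a mosquitocidal Bacillus thuringiensis strain")
# print(get_matched_pos_chunks(doc, pattern))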