def write_visualization(names: list, fpath: Path, out_path: Path, taxonomy, tti):
    # Now look for named entities
    nlp = English()
    docx = nlp('\n'.join(names))
    matcher, nlp = build_phrase_matcher(nlp, taxonomy)
    matches = matcher(docx)
    match_spans = []
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
        span = docx[start:end]  # get the matched slice of the doc
        # print(rule_id, span.text)
        # create a new Span for each match and use the match_id (ANIMAL) as the label
        span = Span(docx, start, end, label=match_id)
        match_spans.append(span)
    docx.ents = list(docx.ents) + filter_spans(match_spans)
    # doc11.ents = list(doc11.ents) + [span]  # add span to doc.ents
    html = create_visualization2(docx, False)
    # print(len(html))
    # fname = f'{datetime.now().strftime("%m%d%y_%H%M%S")}.html'
    abbrev = circle_abbrev_from_path(fpath)
    out_path = out_path / f'{abbrev}-{fpath.suffix[1:]}-spacy.html'
    # print(out_path)
    tti.save_visualization(out_path, html)
def create_a_labeled_jsonl_dataset(pattern_file_path, jsonl_datafile, annotated_jsonl_datafile):
    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)
    with open(pattern_file_path, "r") as fr:
        for idx, line in enumerate(fr):
            pattern_json = srsly.json_loads(line)
            pattern = pattern_json["pattern"]
            label = pattern_json["label"]
            # pattern_name = "_".join([x["lower"] for x in pattern])
            matcher.add(label, [pattern])
    with open(annotated_jsonl_datafile, "w") as fw:
        with open(jsonl_datafile, "r") as fr:
            for idx, line in enumerate(fr):
                line_json = srsly.json_loads(line)
                line_nlp = nlp(line_json["text"])
                matches = matcher(line_nlp)
                spans = []
                for match_id, start, end in matches:
                    span = Span(line_nlp, start, end, label=match_id)
                    spans.append(span)
                spans = filter_spans(spans)
                if spans:
                    print(f"{idx}, spans({len(spans)}):{spans}")
                spans_dicts_list = _spans_to_spans_dicts_list(spans)
                line_json["spans"] = spans_dicts_list
                fw.write(json.dumps(line_json) + "\n")
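A minimal in-memory sketch of the matching core above, assuming spaCy v3's Matcher.add signature; the "FRUIT" / "blood orange" pattern is only an illustration of what a JSONL pattern line (a "label" plus a token-level "pattern") might contain, not the project's real data.

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")  # the function above loads en_core_web_sm; a blank pipeline is enough here
matcher = Matcher(nlp.vocab)

# Hypothetical pattern record, standing in for one JSONL line of the pattern file
pattern_json = {"label": "FRUIT", "pattern": [{"LOWER": "blood"}, {"LOWER": "orange"}]}
matcher.add(pattern_json["label"], [pattern_json["pattern"]])

doc = nlp("She bought a blood orange.")
spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matcher(doc)]
spans = filter_spans(spans)  # keep only the longest of any overlapping matches
print([(s.text, s.label_) for s in spans])  # [('blood orange', 'FRUIT')]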
def _proc(self, doc: Doc, pattern: Union[Pattern, str], label: str) -> Doc:
    spans = self.get_spans(doc, pattern, label or self._DEFAULT_LABEL)
    doc.ents = filter_spans(tuple(spans) + doc.ents)  # type: ignore
    # TODO: https://github.com/python/mypy/issues/3004
    if self.merge:
        merge_spans(doc, spans)
    return doc
def phrasesExtraction(sentence):
    '''
    This function is responsible for extracting the noun and verb phrases
    from the given sentence.
    '''
    # instantiate a Matcher instance
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", None, pattern)
    doc = nlp(sentence)
    # call the matcher to find matches
    matches = matcher(doc)
    spans = [doc[start:end] for _, start, end in matches]
    # print('NP: ', list(doc.noun_chunks))
    # print('VP: ', filter_spans(spans))
    for np in list(doc.noun_chunks):
        sentence = sentence.replace(str(np), 'NP')
    for vp in filter_spans(spans):
        sentence = sentence.replace(str(vp), 'VP')
    sentence = sentence.replace('(NP)', 'NP')
    sentence = sentence.replace('(VP)', 'VP')
    # print(sentence)
    return sentence
def chunk_selection(doc: Doc) -> Iterable[Candidate]:
    """Get keyword candidates from noun chunks and entities.

    Args:
        doc (Doc): doc.

    Returns:
        Iterable[Candidate]
    """
    surface_forms = []
    spans = list(doc.ents)
    ent_words: Set[int] = set()  # token indices covered by entities
    sentence_indices = []
    for span in spans:
        ent_words.update(token.i for token in span)
    for np in doc.noun_chunks:
        # https://github.com/explosion/sense2vec/blob/c22078c4e6c13038ab1c7718849ff97aa54fb9d8/sense2vec/util.py#L105
        while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
            np = np[1:]
        if not any(w.i in ent_words for w in np):
            spans.append(np)
    for sent in doc.sents:
        sentence_indices.append((sent.start, sent.end))
    for span in filter_spans(spans):
        for i, token_indices in enumerate(sentence_indices):
            if span.start >= token_indices[0] and span.end <= token_indices[1]:
                surface_forms.append((i, span))
                break
    return _merge_surface_forms(surface_forms)
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
        if len(mlist) != len(sent):
            t, m = None, None
            for t, m in zip(sent, mlist):
                if t.text != m.midasi:
                    break
            raise ValueError(
                f"""Internal error occurred
                Sentence: {sent.text}
                mlist : {[m.midasi for m in mlist]}
                tokens: {[t.text for t in sent]}
                diff  : {m.midasi}, {t.text}
                """
            )
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
    doc.noun_chunks_iterator = knp_noun_chunker  # type: ignore
    # TODO: https://github.com/python/mypy/issues/3004
    return doc
def match_nounphrases(self, doc_):
    '''
    Get noun phrases based on POS patterns

    Arguments:
        doc_ - spaCy Doc to have noun phrases extracted from

    Returns:
        keyphrases - list of all noun phrases
    '''
    # find matches, remove overlaps
    matches = self.matcher(doc_)
    if not matches:
        return matches
    spans = [doc_[start:end] for _, start, end in matches]
    spans = filter_spans(spans)  # remove overlaps
    spans = [span.text.strip() for span in spans]  # keep text only
    spans = [s[1:] if s.startswith('-') else s for s in spans]  # sometimes, '-' is the first char
    short_spans = []
    # split long spans (5 or more words)
    for span in spans:
        if len(span.split()) >= 5:
            short_spans.extend(self.split_long_nps(span))
        else:
            short_spans.append(span)
    stack = [short_spans[0]]  # remove duplicates, but keep order
    for span in short_spans[1:]:
        if span not in stack:
            stack.append(span)
    stack = [w for w in stack if w not in self.single_stopwords]
    return stack
def moodys_merge_noun_chunks(doc):
    """
    Merge noun chunks into a single token.

    Modified from sources of:
    - https://github.com/cemoody/lda2vec/blob/master/lda2vec/preprocess.py
    - https://spacy.io/api/pipeline-functions#merge_noun_chunks

    :param doc: Doc object.
    :returns: Doc object with merged noun chunks.
    """
    bad_deps = ('amod', 'compound')
    if not doc.is_parsed:
        return doc
    with doc.retokenize() as retokenizer:
        for np in filter_spans(list(doc.noun_chunks)):
            # Only keep adjectives and nouns, e.g. "good ideas"
            while len(np) > 1 and np[0].dep_ not in bad_deps:
                np = np[1:]
            if len(np) > 1:
                # Merge NPs
                attrs = {"tag": np.root.tag, "dep": np.root.dep}
                retokenizer.merge(np, attrs=attrs)
    return doc
def match_spans(parsed_verses, matcher):
    """For every verse, apply custom matcher rules and isolate the set of
    relevant spans which match the tense rules and map to verse reference
    tuples. The identified spans can then later be matched with words from
    the verbs dictionaries.
    """
    verse2spans = collections.defaultdict(dict)
    for trans, ref_tuples in parsed_verses.items():
        for ref_tuple, spacy_doc in ref_tuples.items():
            matches = matcher(spacy_doc)
            # retrieve spaCy Span objects and give them tense tags
            spans = []
            for m_id, start, end in matches:
                span = spacy_doc[start:end]
                span._.tense_tag = nlp.vocab.strings[m_id]
                correct_span(span)
                spans.append(span)
            filtered_spans = filter_spans(spans)  # filter out overlapping spans; keep longest
            attach_span(filtered_spans)  # ensure tokens are mapped to their matched span
            for span in filtered_spans:
                bequeath_tense(span)
            # save positive matches; unmatched verses will be recognized later
            if filtered_spans:
                verse2spans[trans][ref_tuple] = filtered_spans
            else:
                continue
    return verse2spans
def test_filter_spans(doc):
    # Test filtering duplicates
    spans = [doc[1:4], doc[6:8], doc[1:4], doc[10:14]]
    filtered = filter_spans(spans)
    assert len(filtered) == 3
    assert filtered[0].start == 1 and filtered[0].end == 4
    assert filtered[1].start == 6 and filtered[1].end == 8
    assert filtered[2].start == 10 and filtered[2].end == 14
    # Test filtering overlaps with longest preference
    spans = [doc[1:4], doc[1:3], doc[5:10], doc[7:9], doc[1:4]]
    filtered = filter_spans(spans)
    assert len(filtered) == 2
    assert len(filtered[0]) == 3
    assert len(filtered[1]) == 5
    assert filtered[0].start == 1 and filtered[0].end == 4
    assert filtered[1].start == 5 and filtered[1].end == 10
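The test above relies on a `doc` pytest fixture defined elsewhere in the test suite. A minimal sketch of a compatible fixture (any English Doc with at least 14 tokens works), plus one extra check of the longest-span-wins behaviour, assuming pytest and a blank spaCy pipeline:

import pytest
import spacy
from spacy.util import filter_spans


@pytest.fixture
def doc():
    # Any Doc with at least 14 tokens works here; a blank pipeline avoids
    # downloading a trained model.
    nlp = spacy.blank("en")
    return nlp("one two three four five six seven eight nine ten eleven twelve thirteen fourteen")


def test_longest_span_wins(doc):
    # filter_spans keeps the longest span of each overlapping group, drops
    # exact duplicates, and returns the result in document order.
    filtered = filter_spans([doc[2:6], doc[3:5], doc[2:6]])
    assert [(s.start, s.end) for s in filtered] == [(2, 6)]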
def get_matches_in_proper_format(text, label_and_terms, nlp):
    """
    Match the terms of an entity/label in a text and return them in the
    format for the NER model. We use PhraseMatcher to find words or phrases
    in texts based on patterns.
    """
    matched_spans = []
    for label, terms in label_and_terms.items():
        # Initialize the PhraseMatcher with the vocabulary
        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        terms = set(terms)
        # Add the pattern to the matcher
        patterns = [nlp.make_doc(text) for text in terms]
        matcher.add("TerminologyList", patterns)
        # Get the Doc from the text
        doc = nlp(text)
        # Find all sequences matching the supplied patterns on the Doc
        matches = matcher(doc)
        # Get the spans matched
        for match_id, start, end in matches:
            span = doc[start:end]
            # Update the custom attribute of the span to use it later
            span._.label = label
            matched_spans.append(span)
    # Remove overlaps. The (first) longest span is preferred over shorter spans
    matched_spans_filtered = filter_spans(matched_spans)
    entities = []
    for span in matched_spans_filtered:
        # Get the info of the match needed for the format of data
        match_info_in_text = (span.start_char, span.end_char, span._.label)
        entities.append(match_info_in_text)
    return entities
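A short usage sketch, assuming the function above is in scope. The function reads a custom `label` Span extension, which the original module presumably registers elsewhere, so the sketch registers it itself; the example terms and offsets are illustrative only.

import spacy
from spacy.tokens import Span

# Register the custom extension the function relies on (an assumption about
# where this normally happens in the original project).
if not Span.has_extension("label"):
    Span.set_extension("label", default=None)

nlp = spacy.blank("en")
label_and_terms = {
    "LANGUAGE": ["python", "rust"],
    "TOOL": ["spacy"],
}
print(get_matches_in_proper_format("I write Python and use spaCy.", label_and_terms, nlp))
# e.g. [(8, 14, 'LANGUAGE'), (23, 28, 'TOOL')]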
def main(model: str, origin_jsonl_path: str, label_by_model_jsonl_path: str):
    nlp = spacy.load(model)
    origin_jsonl_path = Path(origin_jsonl_path)
    label_by_model_jsonl_path = Path(label_by_model_jsonl_path)
    print(
        f"annotate with {model}:\n{origin_jsonl_path}->{label_by_model_jsonl_path}"
    )
    with open(label_by_model_jsonl_path, "w") as fw:
        with open(origin_jsonl_path, "r") as fr:
            for count_lines, _ in enumerate(tqdm(fr)):
                pass
        with open(origin_jsonl_path, "r") as fr:
            for idx, line in enumerate(tqdm(fr, total=count_lines + 1)):
                line_json = srsly.json_loads(line)
                line_json["spans"] = list()  # delete any existing spans (labels)
                line_nlp = nlp(line_json["text"])
                spans = []
                for ent in line_nlp.ents:
                    span = Span(line_nlp, ent.start, ent.end, label=ent.label_)
                    spans.append(span)
                spans = filter_spans(spans)  # useless line, NER model should not output problematic spans
                # if spans:
                #     print(f"{idx}, spans({len(spans)}):{spans}")
                spans_dicts_list = _spans_to_spans_dicts_list(spans)
                line_json["spans"] = spans_dicts_list
                fw.write(json.dumps(line_json) + "\n")
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    texts = [line.rstrip() for line in open(in_file, 'r')]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # print(doc)
            spans = get_phrases(doc, wn_lemmas)
            spans = filter_spans(spans)
            # print('NOUN SPAN', str(spans))
            doc = merge_phrases(doc, spans)
            spans = get_adjective_phrases(doc)
            spans = filter_spans(spans)
            # print('ADJ SPAN', str(spans))
            # print('*-----------------------------------------*')
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
def _merge_doc(doc):
    from spacy.util import filter_spans

    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc
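A small usage sketch for the helper above. The en_core_web_sm model is an assumption (any pipeline with a parser and NER works), and the exact merges depend on that model's predictions.

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes a trained pipeline with a parser and NER
doc = _merge_doc(nlp("The quick brown fox jumped over Barack Obama."))
print([t.text for t in doc])
# e.g. ['The quick brown fox', 'jumped', 'over', 'Barack Obama', '.']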
def __call__(self, doc: Doc) -> Doc:
    ents = [
        Span(doc, start=s, end=e, label="CUSTOM")
        for _, s, e in self.matcher(doc)
    ]
    ents = filter_spans(ents)
    doc.set_ents(ents)
    return doc
def filter_matches(text_spacy, matches):
    spans_orig = [text_spacy[start:end] for _, start, end in matches]
    spans_filtered = filter_spans(spans_orig)
    match_filter_indx = [
        index for index, item in enumerate(spans_orig) if item in spans_filtered
    ]
    matches_filtered = [matches[indx] for indx in match_filter_indx]
    return matches_filtered
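A quick sketch of how filter_matches() could be used to prune raw Matcher output down to the longest non-overlapping matches, assuming the function above and filter_spans are in scope; the "PLACE" patterns are illustrative only.

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("New York City is large.")

matcher = Matcher(nlp.vocab)
# Two deliberately overlapping patterns
matcher.add("PLACE", [[{"LOWER": "new"}, {"LOWER": "york"}],
                      [{"LOWER": "new"}, {"LOWER": "york"}, {"LOWER": "city"}]])

matches = matcher(doc)               # overlapping (match_id, start, end) tuples
kept = filter_matches(doc, matches)  # only the longest survives
print([doc[start:end].text for _, start, end in kept])  # ['New York City']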
def extract_people(doc):
    # Merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = util.filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    # Get the folks
    people = [str(person) for person in filter(lambda w: w.ent_type_ == "PERSON", doc)]
    return people
def to_format(text, data, label):
    """
    Format `data` into the standard format used for spaCy models.
    """
    res = []
    for word in data:
        position = find_position(text, word)
        for pos_start, pos_end in position:
            if pos_start + pos_end != 0:
                res.append(Span([pos_start, pos_end, label]))
    return [i.get_tuple() for i in filter_spans(res)]
def test_issue6207(en_tokenizer):
    doc = en_tokenizer("zero one two three four five six")
    # Make spans
    s1 = doc[:4]
    s2 = doc[3:6]  # overlaps with s1
    s3 = doc[5:7]  # overlaps with s2, not s1
    result = util.filter_spans((s1, s2, s3))
    assert s1 in result
    assert s2 not in result
    assert s3 in result
def featurize(self, text: str, limit: bool = False) -> DefaultDict[Any, Any]:
    doc = self.nlp(text)

    # Extract tokens containing...
    # (1) Abbreviations
    abbrev_tokens = []
    for abrv in doc._.abbreviations:
        abbrev_tokens.append(str(abrv._.long_form).split())
    abbrev_tokens_flattened = [t for at in abbrev_tokens for t in at]

    # (2) Entities
    entities = [str(e) for e in doc.ents]
    entity_tokens = [e.split() for e in entities]
    entity_tokens_flattened = [t for et in entity_tokens for t in et]

    # (3) Noun phrases
    np_tokens = []
    for chunk in doc.noun_chunks:
        np_tokens.append(str(chunk.text).split())
    np_tokens_flattened = [t for et in np_tokens for t in et]

    # (4) Verb phrases
    verb_matches = self.verb_matcher(doc)
    spans = [doc[start:end] for _, start, end in verb_matches]
    vp_tokens = filter_spans(spans)
    vp_tokens_flattened = [str(t) for et in vp_tokens for t in et]

    # Limit the samples.
    if limit:
        doc = doc[:limit]

    # Aggregate all features together.
    features: DefaultDict[str, List[Union[int, str]]] = defaultdict(list)
    for token in doc:
        features["tokens"].append(str(token.text))
        features["pos"].append(str(token.tag_))  # previously token.pos_
        features["head"].append(str(token.head))
        # (Note: the following features are binary lists indicating the presence of a
        # feature or not per token, like "[1 0 0 1 1 1 0 0 ...]")
        features["entities"].append(1 if token.text in entity_tokens_flattened else 0)
        features["np"].append(1 if token.text in np_tokens_flattened else 0)
        features["vp"].append(1 if token.text in vp_tokens_flattened else 0)
        features["abbreviation"].append(1 if token.text in abbrev_tokens_flattened else 0)

    return features
def merge_phrases(doc: Doc) -> Doc:
    """Transform a spaCy Doc to match the sense2vec format: merge entities
    into one token and merge noun chunks without determiners.

    doc (Doc): The document to merge phrases in.
    RETURNS (Doc): The Doc with merged tokens.
    """
    spans = get_phrases(doc)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc
def __call__(self, doc: Doc) -> Doc: """ Slightly modified from spacy.pipeline.function.merge_entities to accommodate stopword trimming. """ with doc.retokenize() as retokenizer: # Merge discovered entities / noun chunks. # Ones found via `PipedPhraseMatcher` have label "CUSTOM" ents = [ ent for ent in doc.ents if self.filter_entities is None or ent.label_ in self.filter_entities ] custom = set(tok.i for ent in ents for tok in ent if ent.label_ == "CUSTOM") noun_chunks = [] if doc.has_annotation("DEP"): # ensure precedence of CUSTOM phrases noun_chunks = [ noun for noun in doc.noun_chunks if not any(tok.i in custom for tok in noun) ] # eliminate overlapping spans, keeping the longest # NB that, given earlier filtering, CUSTOM phrases should never be subsumed/ # broken up phrases = filter_spans([ p for p in ents + noun_chunks if p.label_ == "CUSTOM" or len(p) <= self.max_phrase_len ]) for phrase in phrases: attrs = { "tag": phrase.root.tag, "dep": phrase.root.dep, "ent_type": phrase.label, } # need to trim leading/trailing stopwords if phrase.label_ != "CUSTOM" and self.stopwords is not None: while phrase and phrase[0].lower_ in self.stopwords: phrase = phrase[1:] while phrase and phrase[-1].lower_ in self.stopwords: phrase = phrase[:-1] if not phrase: continue retokenizer.merge(phrase, attrs=attrs) return doc
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        if len(mlist) != len(sent):
            mlist = _separate_mrph(mlist, sent)
        for label, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, label)
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))
    doc.noun_chunks_iterator = knp_noun_chunker
    return doc
def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
        assert len(mlist) == len(sent)
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
    # TODO: https://github.com/python/mypy/issues/3004
    return doc
def __call__(self, doc: Doc) -> Doc:
    matches = self.get_char_spans(doc.text)
    spans = []
    for i, j, text in matches:
        span = get_doc_char_span(
            doc, i, j, destructive=self.destructive, label=self.get_label(text)
        )
        if span:
            spans.append(span)
    [s.text for s in spans]  # TODO: resolve the evaluation bug and remove this line
    ents = filter_spans(doc.ents + tuple(spans))
    doc.ents = tuple(ents)
    return doc
def __call__(self, doc: Doc) -> Doc:
    '''
    This method will find all temporal connectives and store them in an iterable.

    Parameters:
        doc(Doc): A spaCy document.
    '''
    matches = self._matcher(doc)
    temporal_connectives_spans = [doc[start:end] for _, start, end in matches]
    # Save the temporal connectives found
    doc._.temporal_connectives_span_indices = [
        {'start': span.start, 'end': span.end, 'label': span.label}
        for span in filter_spans(temporal_connectives_spans)
    ]
    return doc
def to_spacy(self, df, file_path=None):
    """
    Convert the dataframe returned by the annotator into a .spacy file.

    Parameters
    ----------
    df (pandas DataFrame): Dataframe returned by the annotator (see annotate()).
    file_path (str): Filepath (including filename) to save the .spacy file to.

    Returns
    -------
    The spaCy DocBin, in case the user wants to combine it with additional
    training data.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Pass the pandas dataframe returned by annotate()")
    if file_path and (not isinstance(file_path, str)):
        raise TypeError("The file_path must be a string or None")
    if file_path is None:
        file_path = os.path.join(os.getcwd(), 'annotations.spacy')
    db = DocBin()
    training_data = [ant for ant in df['annotations'].tolist() if ant]
    for text, annotations in training_data:
        ents = []
        doc = self.nlp(text)
        for start, end, label in annotations['entities']:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span() returns None when the offsets don't align with
                # token boundaries; skip those instead of crashing later
                continue
            ents.append(span)
        # Drop overlapping spans. Note: when spans overlap, the (first) longest
        # span is preferred over shorter spans.
        # See: https://spacy.io/api/top-level#util.filter_spans
        # TODO: alert users that some spans have been dropped.
        doc.ents = filter_spans(ents)
        db.add(doc)
    db.to_disk(file_path)
    print(f"Spacy file saved to: {file_path}")
    return db
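Inferred from the loop above, each non-empty cell of the "annotations" column is expected to be a (text, {"entities": [(start_char, end_char, label), ...]}) pair. A hedged sketch of such a dataframe (the column layout beyond "annotations", and the `annotator` object, are assumptions):

import pandas as pd

# Hypothetical dataframe in the shape to_spacy() iterates over
df = pd.DataFrame({
    "annotations": [
        ("Apple is looking at buying a U.K. startup.",
         {"entities": [(0, 5, "ORG"), (29, 33, "GPE")]}),
        None,  # rows without annotations are filtered out
    ]
})
# annotator.to_spacy(df, file_path="annotations.spacy")  # `annotator` is whatever object exposes this method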
def __call__(self, doc: Doc) -> Doc: """Apply the pipeline component to a `Doc` object. doc (Doc): The `Doc` returned by the previous pipeline component. RETURNS (Doc): The modified `Doc` object. """ spans = self.matcher(doc, as_spans=True) for span in spans: for token in span: token._.set(self._is_emoji, True) if self.merge_spans: spans = filter_spans(spans) with doc.retokenize() as retokenizer: for span in spans: if len(span) > 1: retokenizer.merge(span) return doc
def run_spacytagger(string):
    """
    Runs spaCy on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)

    # For now, at least, perform our own pre-processing
    # --to ensure terms like "presynaptic" are easily found later.
    string = ' '.join(string.split())
    string = collapse(string)

    doc = nlp(string)  # NOTE: spaCy expects and returns unicode
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    filtered = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in filtered:
            retokenizer.merge(span)

    # tag_ is the "fine-grained" POS
    words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

    # The following is only for logging purposes; if necessary, it could be
    # removed for production
    logger.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))
    indent = " "
    longest = max(len(t.text) for t in doc)
    column = (len(doc) - 1) * len(indent) + longest + 2
    wout = '{:' + str(column) + '}| '

    def trav_tree(indents, node):
        logger.info(
            wout.format((indent * indents) + node.text) + ', '.join(
                [str(x) for x in [
                    node.i, node.is_oov, node.lemma_, node.tag_,
                    "<-" + str(node.left_edge), str(node.right_edge) + "->"]]
            )
        )
        for el in node.children:
            # NOTE: Could also change display based on node.lefts and node.rights
            trav_tree(indents + 1, el)

    for sent in doc.sents:
        trav_tree(0, sent.root)
    logger.info('Ents: ' + str(doc.ents))
    logger.info('NPs: ' + str(list(doc.noun_chunks)))

    return words
def get_matched_pos_chunks(doc, pattern):
    """
    Get the list of chunks from the document that match the pattern.

    Overlapping spans will be filtered and the longest ones will be returned
    in the list. For example, the text span "a mosquitocidal Bacillus
    thuringiensis" with PoS tags DT JJ JJ NN will yield the following matches:

        DT JJ JJ NN - a mosquitocidal Bacillus thuringiensis
        JJ JJ NN    - mosquitocidal Bacillus thuringiensis
        JJ NN       - Bacillus thuringiensis
        NN          - thuringiensis

    This function will ignore the shorter overlaps and return the longer ones.
    Not ideal, but it gets the job done in this specific case.

    Arguments:
        doc: Document
            An annotated spaCy document
        pattern: Dict
            A matcher pattern dictionary

    Returns:
        List[Span]
            The list of matching chunks
    """
    matcher = Matcher(nlp.vocab)
    matcher.add("CHUNKS", None, pattern)
    matches = matcher(doc)
    chunk_spans = list()
    for (i, (match_id, start, end)) in enumerate(matches):
        span = doc[start:end]
        chunk_spans.append(span)
    longest_spans = filter_spans(chunk_spans)
    return longest_spans
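For illustration, a hedged sketch of calling a POS-based pattern similar to the one the docstring describes. The pattern, the example sentence, and the use of en_core_web_sm are assumptions (the project's global `nlp` and its real patterns are not shown), and the sketch uses the current Matcher.add signature rather than the older one above.

import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans

# Illustrative pattern: optional determiner, any adjectives, then a noun-like tag
pattern = [{"TAG": "DT", "OP": "?"}, {"TAG": "JJ", "OP": "*"}, {"TAG": {"IN": ["NN", "NNP"]}}]

nlp = spacy.load("en_core_web_sm")  # a tagger is required for TAG-based patterns
doc = nlp("They isolated a mosquitocidal Bacillus thuringiensis strain.")

matcher = Matcher(nlp.vocab)
matcher.add("CHUNKS", [pattern])
spans = [doc[start:end] for _, start, end in matcher(doc)]
# Every sub-match is returned, so filter_spans keeps only the longest span
# of each overlapping group, as described in the docstring above.
print([s.text for s in filter_spans(spans)])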