def create_chunks(self, spndx: Spandex, subchunks: Iterable[Annotation], outcomes: Iterable[str]):
    """Decode per-subchunk BIO-style outcomes into chunk annotations.

    Walks ``subchunks`` and ``outcomes`` in lockstep, opening a new chunk
    annotation whenever an outcome begins a chunk and extending the current
    chunk for continuations, then registers the resulting layer on ``spndx``.

    Args:
        spndx: Spandex instance to receive the new chunk layer.
        subchunks: annotations (e.g. tokens) aligned 1:1 with ``outcomes``.
        outcomes: BIO-style tag strings, parsed by ``self.parse_outcome``
            into a ``(prefix, label)`` pair where prefix ``"O"`` means
            "outside any chunk".

    Notes:
        ``self.attrib_func`` is applied to each freshly created chunk to
        parse the tag and set attributes on the output chunk type.
        Fixes vs. the previous revision: the input iterable is materialized
        once (it was consumed twice, which yielded nothing for generator
        inputs), the unused ``texts`` computation was dropped, and a ``B``
        prefix now always starts a new chunk even when the label matches
        the previous one, per standard BIO decoding.
    """
    subchunks = list(subchunks)
    chunks = []
    chunk = None
    prev_label = ""
    for subchunk, outcome in zip(subchunks, outcomes):
        prefix, label = self.parse_outcome(outcome)
        if prefix == "O":
            # Outside any chunk: close the current one.
            chunk = None
            prev_label = ""
            continue
        if prefix == "B" or label != prev_label or chunk is None:
            # Start a new chunk at this subchunk's boundaries.
            span = Span(begin=subchunk.begin, end=subchunk.end)
            chunk = self.chunk_type()
            self.attrib_func(chunk, outcome)
            chunks.append((span, chunk))
        else:
            # Continuation: stretch the open chunk to cover this subchunk.
            # NOTE(review): only the chunk annotation's end is extended, not
            # the Span stored alongside it — presumably the layer index only
            # reads the annotation; verify against add_layer's contract.
            chunk.end = subchunk.end
        prev_label = label
    spndx.add_layer(self.chunk_type, chunks)
def process(self, spndx: Spandex, **kwargs):
    """Run the spaCy pipeline over the Spandex and copy annotations back.

    Args:
        **kwargs: Keyword Arguments

    Keyword Args:
        annotation_layers (:obj:`AnnotationLayer`):
            Bitwise mask of AnnotationLayers indicating which layers to
            populate in Spandex. Default value is AnnotationLayers.ALL()
        window_type (str or type):
            Class Type of object to run processing over. A common use case
            would be to run on boundaries already defined prior to
            processing. For example processing a document by subsection
            boundaries. Default of None means to process the full contents
            of the Spandex.
    """
    # FIXME, better in init or as kwargs?
    # window_type = kwargs.get('window_type', None)
    layers = kwargs.get('annotation_layers', AnnotationLayers.ALL())
    if self.window_type:
        # Run the pipeline separately over each pre-existing window.
        for window in spndx.select(self.window_type):
            doc = self.spacy_pipeline(spndx.spanned_text(window))
            SpacyToSpandexUtils.spacy_to_spandex(doc, spndx, layers, window)
    else:
        # No window type configured: process the entire document at once.
        doc = self.spacy_pipeline(spndx.content_string)
        SpacyToSpandexUtils.spacy_to_spandex(doc, spndx, layers)
def __iter__(self):
    """Yield a fresh Spandex per URI, recording the URI in a dedicated view."""
    for item_uri in self.uris:
        spandex_obj = Spandex()
        # The created view carries the URI itself; downstream readers
        # look it up by the well-known view name.
        spandex_obj.create_view(constants.SPANDEX_URI_VIEW,
                                content_string=item_uri,
                                content_mime="text/uri")
        yield spandex_obj
def copy_view(spndx: Spandex, src_viewname: str, tgt_viewname: str):
    """Copy a view's content and MIME type onto another view.

    The target view is created if it does not yet exist.

    Args:
        spndx: Spandex holding both views.
        src_viewname: name of the view to copy from (must exist).
        tgt_viewname: name of the view to copy into.

    Returns:
        The target view, after its content has been overwritten.
    """
    source = spndx.get_view(src_viewname)
    target = spndx.get_or_create_view(tgt_viewname)
    target.content_string = source.content_string
    target.content_mime = source.content_mime
    return target
def uri_to_spndx(uri, viewname=None):
    """Load the contents of ``uri`` into a new Spandex view.

    Args:
        uri: local file path (no URL scheme) or a fetchable URL.
        viewname: view to place the content in; defaults to
            ``constants.SPANDEX_DEFAULT_VIEW``.

    Returns:
        A new Spandex whose ``viewname`` view holds the fetched text.

    Notes:
        Fixed vs. the previous revision: the file/URL handle was never
        closed (a resource leak); it is now closed via a ``with`` block
        (both ``open`` file objects and ``urlopen`` responses are context
        managers).
        NOTE(review): ``urlopen(...).read()`` returns *bytes*, so for
        remote URIs ``content_string`` receives bytes — presumably callers
        only pass local paths today; confirm and decode explicitly if the
        URL path is actually exercised.
    """
    if not viewname:
        viewname = constants.SPANDEX_DEFAULT_VIEW
    url = urllib.request.urlparse(uri)
    # A URI without a scheme is treated as a local file path.
    with (open(uri) if not url.scheme else urllib.request.urlopen(uri)) as fh:
        content = fh.read()
    spndx = Spandex()
    view = spndx.get_or_create_view(viewname)
    view.content_string = content
    view.content_mime = "text/plain"
    return spndx
def process(self, spndx: Spandex, **kwargs):
    """Annotate the text spans *between* matches of ``self.split_re``.

    For each window (annotations of ``self.window_type``, or the whole
    document when none is configured), finds all separator matches and
    creates one ``self.annotation_type`` annotation per stretch of text
    between them, with offsets shifted into document coordinates.

    Fixed vs. the previous revision: the no-match and trailing-text spans
    ended at ``len(spndx.content_string)`` instead of the window's own
    length, producing wrong (over-long) spans for any window smaller than
    the full document; the trailing check also used ``<=``, which could
    emit an empty annotation when a separator ended exactly at the window
    boundary.
    """
    if self.window_type:
        windows = [w.span for w in spndx.select(self.window_type)]
    else:
        # Default: a single window covering the entire document.
        windows = [Span(0, len(spndx.content_string))]
    annotations = []
    for window in windows:
        window_text = spndx.spanned_text(window)
        matches = list(self.split_re.finditer(window_text))
        if not matches:
            # No separator found, so the whole window is one annotation.
            annotations.append(self.annotation_type(
                begin=window.begin,
                end=window.begin + len(window_text)))
            continue
        if matches[0].span()[0] > 0:
            # Text before the first separator.
            annotations.append(self.annotation_type(
                begin=window.begin,
                end=window.begin + matches[0].span()[0]))
        for m0, m1 in zip(matches[:-1], matches[1:]):
            # Text between consecutive separators.
            annotations.append(self.annotation_type(
                begin=window.begin + m0.span()[1],
                end=window.begin + m1.span()[0]))
        if matches[-1].span()[1] < len(window_text):
            # Straggling text after the last separator.
            annotations.append(self.annotation_type(
                begin=window.begin + matches[-1].span()[1],
                end=window.begin + len(window_text)))
    spndx.add_annotations(*annotations)
def process(self, spndx: Spandex, **kwargs):
    """Create one ``self.annotation_type`` annotation per regex match.

    Scans each window (annotations of ``self.window_type``, or the whole
    document when none is configured) with ``self.match_re`` and adds an
    annotation spanning each match, shifted into document coordinates.
    """
    if self.window_type:
        spans = [w.span for w in spndx.select(self.window_type)]
    else:
        # No window type configured: scan the whole document at once.
        spans = [Span(0, len(spndx.content_string))]
    found = []
    for win in spans:
        text = spndx.spanned_text(win)
        for m in self.match_re.finditer(text):
            start, stop = m.span()
            ann = self.annotation_type()
            # Shift match offsets from window-relative to document-relative.
            ann.span = Span(begin=win.begin + start, end=win.begin + stop)
            found.append(ann)
    spndx.add_annotations(*found)
def map_subchunks_to_outcome(self, spndx: Spandex, chunks: Iterable[Annotation]):
    """Assign a BIO-style outcome string to every subchunk inside each chunk.

    The first subchunk covered by a chunk gets a ``B`` (begin) tag and all
    following ones get ``I`` (inside); ``self.suffix_func`` derives the tag
    suffix from the chunk annotation.

    Args:
        spndx: Spandex used to look up subchunks covered by each chunk span.
        chunks: iterable of ``(span, annotation)`` pairs.

    Returns:
        dict mapping ``(begin, end)`` subchunk offsets to outcome strings.
    """
    outcomes = {}
    for chunk_span, chunk in chunks:
        covered = spndx.covered(self.subchunk_type, chunk_span)
        suffix = self.suffix_func(chunk)
        for idx, (sub_span, _sub) in enumerate(covered):
            marker = 'B' if idx == 0 else 'I'
            outcomes[(sub_span.begin, sub_span.end)] = f"{marker}{suffix}"
    return outcomes
def spacy_to_spandex(spacy_doc, spndx=None, annotation_layers=AnnotationLayers.ALL(), window_span=None):
    """Copy annotations from a processed spaCy Doc into a Spandex.

    Args:
        spacy_doc: spaCy ``Doc`` that has already been run through a pipeline.
        spndx: target Spandex; when None a new one is built from
            ``spacy_doc.text_with_ws``.
        annotation_layers: bitmask selecting which layers (document,
            sentence, token, dependency parse, entity, noun chunk) to
            populate.
            NOTE(review): the default ``AnnotationLayers.ALL()`` is
            evaluated once at definition time — fine if it yields an
            immutable flag value; confirm.
        window_span: when the doc covers only a sub-window of a larger
            document, offsets are shifted by this span (via the
            ``convert_*`` helpers and the Document bounds below).
    """
    if not spndx:
        spndx = Spandex(spacy_doc.text_with_ws)
    if annotation_layers & AnnotationLayers.DOCUMENT:
        # Document annotation spans either the window or the full content.
        if window_span:
            doc = Document(begin=window_span.begin, end=window_span.end)
        else:
            doc_span = Span(0, len(spndx.content_string))
            doc = Document(begin=doc_span.begin, end=doc_span.end)
        spndx.add_annotations(doc)
    if annotation_layers & AnnotationLayers.SENTENCE:
        spndx.add_annotations(*[
            SpacyToSpandexUtils.convert_sentence(s, window_span)
            for s in spacy_doc.sents
        ])
    # Extract tokens and dependency parse
    spacy_toks = [t for t in spacy_doc]
    if annotation_layers & AnnotationLayers.TOKEN:
        all_toks = [
            SpacyToSpandexUtils.convert_token(t, window_span)
            for t in spacy_toks
        ]
        # Keep whitespace tokens out of the token layer, but retain
        # all_toks so dependency heads can still be indexed by spaCy's
        # token index (which counts whitespace tokens).
        word_toks = [(tok, spacy_tok)
                     for (tok, spacy_tok) in zip(all_toks, spacy_toks)
                     if not spacy_tok.is_space]
        toks = [tok for (tok, spacy_tok) in word_toks]
        spndx.add_annotations(*toks)
        # Dependency layer reuses ``toks``/``all_toks``, hence nested here.
        if annotation_layers & AnnotationLayers.DEPPARSE:
            # Pull out dependency graphs
            # One DependencyNode per word token, keyed by its span.
            span_to_nodes = {
                tok.span: DependencyNode(begin=tok.begin, end=tok.end)
                for tok in toks
            }
            depedges = []
            depnodes = []
            depnode_spans = set()  # spans already emitted, to dedupe nodes
            for (tok, spacy_tok) in word_toks:
                # spacy_tok.head.i indexes into all_toks (whitespace included).
                headtok = all_toks[spacy_tok.head.i]
                head_node = span_to_nodes[headtok.span]
                child_span = tok.span
                child_node = span_to_nodes[child_span]
                # get span for full dependency
                depspan = Span(begin=min(tok.begin, headtok.begin),
                               end=max(tok.end, headtok.end))
                # Build edges
                depedge = DependencyEdge(label=spacy_tok.dep_,
                                         head=head_node,
                                         child=child_node)
                depedge.span = depspan
                child_node.head_edge = depedge
                head_node.child_edges.append(depedge)
                # NOTE(review): membership is tested on ``headtok.span`` but
                # ``head_node.span`` is added — presumably these compare
                # equal since the node is built from the token's offsets;
                # verify DependencyNode.span semantics.
                if headtok.span not in depnode_spans:
                    depnodes.append(head_node)
                    depnode_spans.add(head_node.span)
                if child_span not in depnode_spans:
                    depnodes.append(child_node)
                    depnode_spans.add(child_span)
                depedges.append(depedge)
            # push dependency graph onto spandex
            spndx.add_annotations(*depedges)
            spndx.add_annotations(*depnodes)
            dep_parses = []
            for sent in spndx.select(Sentence):
                # One DependencyParse per sentence, rooted at the node
                # spaCy marks as the sentence root.
                dep_parse = DependencyParse(begin=sent.begin, end=sent.end)
                # NOTE(review): dep_parse is queried against the index
                # before being added — presumably select_covered only
                # reads its span; confirm.
                dep_nodes = [
                    n for n in spndx.select_covered(DependencyNode, dep_parse)
                ]
                for dep_node in dep_nodes:
                    if not dep_parse.root and dep_node.is_root:
                        # found the root
                        dep_parse.root = dep_node
                dep_parses.append(dep_parse)
            spndx.add_annotations(*dep_parses)
    if annotation_layers & AnnotationLayers.ENTITY:
        spndx.add_annotations(*[
            SpacyToSpandexUtils.convert_entity(e, window_span)
            for e in spacy_doc.ents
        ])
    if annotation_layers & AnnotationLayers.NOUN_CHUNK:
        spndx.add_annotations(*[
            SpacyToSpandexUtils.convert_noun_chunk(n, window_span)
            for n in spacy_doc.noun_chunks
        ])