Example #1
0
    def create_chunks(self, spndx: Spandex, subchunks: Iterable[Annotation],
                      outcomes: Iterable[str]):
        """Merge BIO-tagged subchunks into chunk annotations and add them as a layer.

        ``self.attrib_func`` is used to parse each outcome tag and apply the
        resulting attributes to the output chunk.

        Args:
            spndx: Spandex the merged chunk layer is added to.
            subchunks: iterable of (Span, Annotation) pairs in document order
                (the pair shape produced by ``spndx.covered`` — see
                ``map_subchunks_to_outcome``).
            outcomes: BIO outcome strings parallel to ``subchunks``; each is
                parsed by ``self.parse_outcome`` into (prefix, label) where
                prefix is one of "B", "I", "O".
        """
        # NOTE: the previous version iterated ``subchunks`` twice (once to
        # pull spanned texts that were never used, once in the main loop),
        # which silently produced no chunks for generator inputs, and it
        # read ``.begin``/``.end`` off the (span, annotation) pair itself.
        chunks = []
        span = None  # span of the chunk currently being grown, or None
        prev_label = ""
        for (subspan, _subchunk), outcome in zip(subchunks, outcomes):
            prefix, label = self.parse_outcome(outcome)

            if prefix == "O":
                # Outside any chunk: close the current one, if any.
                span = None
            elif prefix == "B" or label != prev_label or span is None:
                # Start a new chunk.  A "B" prefix always begins a new chunk,
                # even when the label matches the previous one (two adjacent
                # chunks of the same type).
                span = Span(begin=subspan.begin, end=subspan.end)
                chunk = self.chunk_type()
                self.attrib_func(chunk, outcome)
                chunks.append((span, chunk))
            else:
                # Continuation ("I" with an unchanged label): grow the span
                # already stored in ``chunks`` so the layer reflects the merge.
                span.end = subspan.end
            prev_label = label

        spndx.add_layer(self.chunk_type, chunks)
Example #2
0
    def process(self, spndx: Spandex, **kwargs):
        """Run the spaCy pipeline over the Spandex content and populate layers.

        Args:
            **kwargs: Keyword Arguments

        Keyword Args:
            annotation_layers (:obj:`AnnotationLayer`): Bitwise mask of
                AnnotationLayers indicating which layers to populate in
                Spandex.  Default value is AnnotationLayers.ALL()
        """
        layers = kwargs.get('annotation_layers', AnnotationLayers.ALL())

        if self.window_type:
            # Process each pre-defined window (e.g. a subsection boundary
            # annotated before this step) as its own spaCy document.
            for window in spndx.select(self.window_type):
                doc = self.spacy_pipeline(spndx.spanned_text(window))
                SpacyToSpandexUtils.spacy_to_spandex(doc, spndx, layers,
                                                     window)
        else:
            # No window type configured: run over the full content at once.
            doc = self.spacy_pipeline(spndx.content_string)
            SpacyToSpandexUtils.spacy_to_spandex(doc, spndx, layers)
Example #3
0
 def __iter__(self):
     """Yield a fresh Spandex per URI, storing the URI in a dedicated view."""
     for uri in self.uris:
         spndx = Spandex()
         spndx.create_view(constants.SPANDEX_URI_VIEW,
                           content_string=uri,
                           content_mime="text/uri")
         yield spndx
Example #4
0
def copy_view(spndx: Spandex, src_viewname: str, tgt_viewname: str):
    """Copy one view's content string and MIME type onto another view.

    The target view is created if it does not already exist.

    Returns:
        The target view after the copy.
    """
    source = spndx.get_view(src_viewname)
    target = spndx.get_or_create_view(tgt_viewname)

    target.content_string = source.content_string
    target.content_mime = source.content_mime

    return target
Example #5
0
def uri_to_spndx(uri, viewname=None):
    """Load the contents of a URI (or local file path) into a new Spandex view.

    Args:
        uri: a URL with a scheme (fetched via ``urllib.request.urlopen``) or
            a plain local path (opened directly).
        viewname: name of the view to populate; defaults to
            ``constants.SPANDEX_DEFAULT_VIEW``.

    Returns:
        A new Spandex whose *viewname* view holds the content as text/plain.
    """
    # Local stdlib imports keep this fix self-contained; urlparse lives in
    # urllib.parse (the previous urllib.request.urlparse worked only via an
    # undocumented re-export).
    from contextlib import closing
    from urllib.parse import urlparse

    if not viewname:
        viewname = constants.SPANDEX_DEFAULT_VIEW

    # BUGFIX: the handle was previously never closed (resource leak).
    fh = open(uri) if not urlparse(uri).scheme else urllib.request.urlopen(uri)
    with closing(fh) as handle:
        # NOTE(review): urlopen returns bytes while open() returns str, as in
        # the original code — confirm whether remote content should be decoded.
        content = handle.read()

    spndx = Spandex()
    view = spndx.get_or_create_view(viewname)
    view.content_string = content
    view.content_mime = "text/plain"
    return spndx
Example #6
0
    def process(self, spndx: Spandex, **kwargs):
        """Split each window's text on ``self.split_re`` and annotate the
        segments between matches (e.g. paragraphs between blank-line
        separators) with ``self.annotation_type``.

        Offsets from the regex are window-relative and are shifted by
        ``window.begin`` into document coordinates.
        """
        if self.window_type:
            windows = [w.span for w in spndx.select(self.window_type)]
        else:
            # No window type: treat the full content as one window.
            windows = [Span(0, len(spndx.content_string))]

        annotations = []
        for window in windows:
            window_text = spndx.spanned_text(window)
            matches = list(self.split_re.finditer(window_text))

            if not matches:
                # No separator found, so the whole window is one segment.
                # BUGFIX: bound by the window text, not the whole document
                # (previously used len(spndx.content_string), which extended
                # the annotation past the window end).
                annotations.append(self.annotation_type(
                    begin=window.begin,
                    end=window.begin + len(window_text)))
                continue

            # Leading segment before the first separator, if non-empty.
            if matches[0].span()[0] > 0:
                annotations.append(self.annotation_type(
                    begin=window.begin,
                    end=window.begin + matches[0].span()[0]))

            # Segments between consecutive separators.
            for m0, m1 in zip(matches[:-1], matches[1:]):
                annotations.append(self.annotation_type(
                    begin=window.begin + m0.span()[1],
                    end=window.begin + m1.span()[0]))

            # Trailing segment after the last separator, if non-empty.
            # BUGFIX: strict '<' (was '<='), which produced a zero-length
            # annotation when a separator ended exactly at the window end,
            # and window_text length (was full document length).
            if matches[-1].span()[1] < len(window_text):
                annotations.append(self.annotation_type(
                    begin=window.begin + matches[-1].span()[1],
                    end=window.begin + len(window_text)))

        spndx.add_annotations(*annotations)
Example #7
0
    def process(self, spndx: Spandex, **kwargs):
        """Annotate every match of ``self.match_re`` within each window.

        Match offsets are window-relative; each resulting annotation span is
        shifted by ``window.begin`` into document coordinates.
        """
        if self.window_type:
            windows = [w.span for w in spndx.select(self.window_type)]
        else:
            # Without a window type, scan the full content as one window.
            windows = [Span(0, len(spndx.content_string))]

        found = []
        for window in windows:
            text = spndx.spanned_text(window)
            for match in self.match_re.finditer(text):
                start, stop = match.span()
                annotation = self.annotation_type()
                annotation.span = Span(begin=window.begin + start,
                                       end=window.begin + stop)
                found.append(annotation)
        spndx.add_annotations(*found)
Example #8
0
 def map_subchunks_to_outcome(self, spndx: Spandex,
                              chunks: Iterable[Annotation]):
     """Build a BIO outcome string for every subchunk covered by a chunk.

     Returns:
         dict mapping the (begin, end) of each covered subchunk span to a
         tag string: "B" + suffix for the first subchunk inside a chunk and
         "I" + suffix for the rest, where suffix is produced by
         ``self.suffix_func(chunk)``.
     """
     outcomes = {}
     for chunk_span, chunk in chunks:
         covered = spndx.covered(self.subchunk_type, chunk_span)
         for idx, (sub_span, _sub) in enumerate(covered):
             prefix = "B" if idx == 0 else "I"
             suffix = self.suffix_func(chunk)
             outcomes[(sub_span.begin, sub_span.end)] = f"{prefix}{suffix}"
     return outcomes
Example #9
0
    def spacy_to_spandex(spacy_doc,
                         spndx=None,
                         annotation_layers=AnnotationLayers.ALL(),
                         window_span=None):
        """Copy annotations from a processed spaCy Doc onto a Spandex.

        Args:
            spacy_doc: spaCy ``Doc`` produced by a pipeline run.
            spndx: target Spandex; if falsy, a new one is created from
                ``spacy_doc.text_with_ws``.
            annotation_layers: bitmask selecting which layers to populate.
                NOTE(review): the default is evaluated once at definition
                time — assumed to be an immutable flag value, confirm.
            window_span: optional document-level span the doc text was taken
                from; converted annotation offsets are shifted by it.
        """

        if not spndx:
            spndx = Spandex(spacy_doc.text_with_ws)

        if annotation_layers & AnnotationLayers.DOCUMENT:
            # One Document annotation covering either the window or the
            # entire content string.
            if window_span:
                doc = Document(begin=window_span.begin, end=window_span.end)
            else:
                doc_span = Span(0, len(spndx.content_string))
                doc = Document(begin=doc_span.begin, end=doc_span.end)

            spndx.add_annotations(doc)

        if annotation_layers & AnnotationLayers.SENTENCE:
            spndx.add_annotations(*[
                SpacyToSpandexUtils.convert_sentence(s, window_span)
                for s in spacy_doc.sents
            ])

        # Extract tokens and dependency parse
        spacy_toks = [t for t in spacy_doc]
        if annotation_layers & AnnotationLayers.TOKEN:
            # all_toks stays index-aligned with spacy_toks so that
            # ``spacy_tok.head.i`` can be used as an index into it below.
            all_toks = [
                SpacyToSpandexUtils.convert_token(t, window_span)
                for t in spacy_toks
            ]
            # Whitespace-only tokens are excluded from the token layer.
            word_toks = [(tok, spacy_tok)
                         for (tok, spacy_tok) in zip(all_toks, spacy_toks)
                         if not spacy_tok.is_space]
            toks = [tok for (tok, spacy_tok) in word_toks]
            spndx.add_annotations(*toks)

            if annotation_layers & AnnotationLayers.DEPPARSE:
                # Pull out dependency graphs
                # One DependencyNode per non-space token, keyed by span so
                # head and child lookups below resolve to shared instances.
                span_to_nodes = {
                    tok.span: DependencyNode(begin=tok.begin, end=tok.end)
                    for tok in toks
                }

                depedges = []
                depnodes = []
                depnode_spans = set()  # guards against duplicate node entries
                for (tok, spacy_tok) in word_toks:
                    # Head lookup relies on all_toks/spacy_toks alignment.
                    # NOTE(review): a head that is a whitespace token would
                    # miss span_to_nodes and raise KeyError — assumed not to
                    # occur in practice, confirm with spaCy output.
                    headtok = all_toks[spacy_tok.head.i]
                    head_node = span_to_nodes[headtok.span]
                    child_span = tok.span
                    child_node = span_to_nodes[child_span]

                    # get span for full dependency
                    depspan = Span(begin=min(tok.begin, headtok.begin),
                                   end=max(tok.end, headtok.end))
                    # Build edges
                    depedge = DependencyEdge(label=spacy_tok.dep_,
                                             head=head_node,
                                             child=child_node)
                    depedge.span = depspan
                    child_node.head_edge = depedge
                    head_node.child_edges.append(depedge)
                    # Record each node exactly once, using its span as identity.
                    if headtok.span not in depnode_spans:
                        depnodes.append(head_node)
                        depnode_spans.add(head_node.span)

                    if child_span not in depnode_spans:
                        depnodes.append(child_node)
                        depnode_spans.add(child_span)
                    depedges.append(depedge)
                # push dependency graph onto spandex
                spndx.add_annotations(*depedges)
                spndx.add_annotations(*depnodes)

                # One DependencyParse per sentence, rooted at the first
                # covered node whose ``is_root`` flag is set.
                dep_parses = []
                for sent in spndx.select(Sentence):
                    dep_parse = DependencyParse(begin=sent.begin, end=sent.end)
                    dep_nodes = [
                        n for n in spndx.select_covered(
                            DependencyNode, dep_parse)
                    ]
                    for dep_node in dep_nodes:
                        if not dep_parse.root and dep_node.is_root:
                            # found the root
                            dep_parse.root = dep_node
                    dep_parses.append(dep_parse)

                spndx.add_annotations(*dep_parses)

        if annotation_layers & AnnotationLayers.ENTITY:
            spndx.add_annotations(*[
                SpacyToSpandexUtils.convert_entity(e, window_span)
                for e in spacy_doc.ents
            ])

        if annotation_layers & AnnotationLayers.NOUN_CHUNK:
            spndx.add_annotations(*[
                SpacyToSpandexUtils.convert_noun_chunk(n, window_span)
                for n in spacy_doc.noun_chunks
            ])