Example #1
def _postprocess(tree, start=0, edu_start=1):
    """
    Helper function: Convert the NLTK-parsed representation of an RST tree
    to one using educe-style Standoff objects
    """
    if isinstance(tree, Tree):
        children = []
        position = start - 1  # compensate for virtual whitespace added below
        node = _parse_node(treenode(tree), Span(-1, -1))
        edu_start2 = node.edu_span[0]

        for child_ in tree:
            # (NB: +1 to add virtual whitespace between EDUs)
            child = _postprocess(child_, position + 1, edu_start2)
            children.append(child)
            # pylint: disable=E1101
            child_sp = _tree_span(child)
            # pylint: enable=E1101
            position = child_sp.char_end

        node.span = Span(start, position)
        return RSTTree(node, children)
    else:
        if tree.startswith("["):
            return _parse_edu(tree[1:-1], edu_start, start)
        else:
            raise RSTTreeException("ERROR in rst tree format for leaf: %s"
                                   % tree)
Example #2
    def walk(subtree, posinfo=PosInfo(text=0, edu=0)):
        """
        walk down first-cut tree, counting span info and returning a
        fancier tree along the way
        """
        if isinstance(subtree, Tree):
            start = copy.copy(posinfo)
            children = []
            nuc_kids = []
            for kid in subtree:
                tree, posinfo, nuc_kid = walk(kid, posinfo)
                children.append(tree)
                nuc_kids.append(nuc_kid)
            nuclearity = ''.join(nuc_kids)

            match = _lw_type_re.match(treenode(subtree))
            if not match:
                raise RSTTreeException("Missing nuclearity annotation in " +
                                       str(subtree))
            nuc = match.group("nuc")
            rel = match.group("rel") or "leaf"
            edu_span = (start.edu, posinfo.edu - 1)
            span = Span(start.text, posinfo.text)
            node = Node(nuclearity, edu_span, span, rel)
            return SimpleRSTTree(node, children), posinfo, nuc
        else:
            text = subtree
            start = posinfo.text
            end = start + len(text)
            posinfo2 = PosInfo(text=end, edu=posinfo.edu + 1)
            return EDU(posinfo.edu, Span(start, end), text), posinfo2, "leaf"
Example #3
 def assertOverlap(self, expected, pair1, pair2, **kwargs):
     "assert that `pair1.overlaps(pair2) == expected` (modulo boxing)"
     (x1, y1) = pair1
     (x2, y2) = pair2
     (rx, ry) = expected
     o = Span(x1, y1).overlaps(Span(x2, y2), **kwargs)
     self.assertTrue(o)
     self.assertEqual(Span(rx, ry), o)
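A rough usage sketch (not from the source): judging from this helper and from Example #14 below, `overlaps` appears to return the overlapping region as a truthy Span when the two spans intersect, and None otherwise. The values here are hypothetical:

a = Span(1, 5)
b = Span(3, 8)
a.overlaps(b)                    # expected: Span(3, 5)
Span(0, 2).overlaps(Span(4, 6))  # expected: None (disjoint spans)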
Example #4
File: doc.py Project: fbuijs/educe
def split_doc(doc, middle):
    """
    Given a split point, break a document into two pieces.
    If the split point is None, we take the whole document
    (this is slightly different from having -1 as a split
    point)

    Raise an exception if there are any annotations that span the point.

    Parameters
    ----------
    doc : Document
        The document we want to split.
    middle : int
        Split point.

    Returns
    -------
    doc_prefix : Document
        Deep copy of `doc` restricted to span [:middle]
    doc_suffix : Document
        Deep copy of `doc` restricted to span [middle:] ; the span of each
        annotation is shifted to match the new text.
    """
    doc_len = doc.text_span().char_end
    if middle < 0:
        middle = doc_len + 1 + middle

    def straddles(point, span):
        """
        True if the point is somewhere in the middle of the span
        (sitting at right edge doesn't count).

        Note that this is not the same as checking for enclosure
        because we do not include the rightward edge
        """
        if span is None:
            return False
        return span.char_start < point and span.char_end > point

    leftovers = [
        x for x in doc.annotations() if straddles(middle, x.text_span())
    ]

    if leftovers:
        oops = ("Can't split document [{origin}] at {middle} because it is "
                "straddled by the following annotations:\n"
                "{annotations}\n"
                "Either split at a different place or remove the annotations")
        leftovers = [' * %s %s' % (x.text_span(), x) for x in leftovers]
        raise StacDocException(
            oops.format(origin=doc.origin,
                        middle=middle,
                        annotations='\n'.join(leftovers)))

    prefix = Span(0, middle)
    suffix = Span(middle, doc_len)
    return narrow_to_span(doc, prefix), narrow_to_span(doc, suffix)
Example #5
    def test_simple_align(self):
        "trivial token realignment"

        tokens = ["a", "bb", "ccc"]
        text = "a bb    ccc"
        spans = list(generic_token_spans(text, tokens))
        expected = [Span(0, 1),
                    Span(2, 4),
                    Span(8, 11)]
        self.assertEqual(expected, spans)
Example #6
    def test_messy_align(self):
        "ignore whitespace in token"

        tokens = ["a", "b b", "c c c"]
        text = "a bb    ccc"
        spans = list(generic_token_spans(text, tokens))
        expected = [Span(0, 1),
                    Span(2, 4),
                    Span(8, 11)]
        self.assertEqual(expected, spans)
Example #7
def _actually_split(tcache, doc, dialogue, turn):
    """Split the dialogue before the given turn.
    """
    dspan = dialogue.text_span()
    tspan = turn.text_span()
    span1 = Span(dspan.char_start, tspan.char_start - 1)
    span2 = Span(tspan.char_start - 1, dspan.char_end)
    dialogue1 = dialogue
    dialogue2 = copy.deepcopy(dialogue)
    _set(tcache, span1, dialogue1)
    _set(tcache, span2, dialogue2)
    doc.units.append(dialogue2)
    dialogue2.features = {}
Example #8
def generic_token_spans(text, tokens, offset=0, txtfn=None):
    """
    Given a string and a sequence of substrings within that string,
    infer a span for each of the substrings.

    We infer these spans by walking the text as we consume the
    substrings, skipping over any whitespace (including whitespace
    within the tokens). For this to work, the substring sequence
    must be identical to the text modulo whitespace.

    Spans are relative to the start of the string itself, but can be
    shifted by passing an offset (the start of the original string's
    span). Empty tokens are accepted but have a zero-length span.

    Note: this function is lazy so you can use it incrementally
    provided you can generate the tokens lazily too

    You probably want `token_spans` instead; this function is meant
    to be used for similar tasks outside of pos tagging

    :param txtfn: function to extract text from a token (default None,
                  treated as identity function)
    """
    txt_iter = ifilterfalse(lambda x: x[1].isspace(), enumerate(text))
    txtfn = txtfn or (lambda x: x)
    last = offset  # for corner case of empty tokens
    for token in tokens:
        tok_chars = list(ifilterfalse(lambda x: x.isspace(), txtfn(token)))
        if not tok_chars:
            yield Span(last, last)
            continue
        prefix = list(islice(txt_iter, len(tok_chars)))
        if not prefix:
            msg = "Too many tokens (current: %s)" % txtfn(token)
            raise EducePosTagException(msg)
        last = prefix[-1][0] + 1 + offset
        span = Span(prefix[0][0] + offset, last)
        pretty_prefix = text[span.char_start:span.char_end]
        # check the text prefix to make sure we have the same
        # non-whitespace characters
        for txt_pair, tok_char in zip(prefix, tok_chars):
            idx, txt_char = txt_pair
            if txt_char != tok_char:
                msg = "token mismatch at char %d (%s vs %s)\n"\
                    % (idx, txt_char, tok_char)\
                    + " token: [%s]\n" % token\
                    + " text:  [%s]" % pretty_prefix
                raise EducePosTagException(msg)
        yield span
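A hand-worked sketch of the behaviour the docstring describes (hypothetical values, not a verified run):

# tokens "a" and "bb" inside "a  bb", shifted by offset=10
spans = list(generic_token_spans("a  bb", ["a", "bb"], offset=10))
# expected, following the walk above: [Span(10, 11), Span(13, 15)]
# an empty token would yield a zero-length Span at the current position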
Example #9
def _load_rst_wsj_corpus_text_file_wsj(f):
    """Actually do load"""
    text = f.read()

    start = 0
    sent_id = 0
    output_sents = []
    output_paras = []
    for para_id, paragraph in enumerate(text.split(WSJ_SEP_PARA)):
        para_sents = []
        for sentence in paragraph.split(WSJ_SEP_SENT):
            end = start + len(sentence)
            # NEW: remove trailing white space
            rws = len(sentence) - len(sentence.rstrip())
            if rws:
                end -= rws
            # end NEW
            if end > start:
                para_sents.append(Sentence(sent_id, Span(start, end)))
                sent_id += 1
            start = end + rws + 1  # + 1 stands for + len(WSJ_SEP_SENT)
        output_paras.append(Paragraph(para_id, para_sents))
        output_sents.extend(para_sents)
        start += 2  # whitespace and second newline

    return text, output_sents, output_paras
Example #10
def shift_span(span, updates, stretch_right=False):
    """
    Given a span and an updates tuple, return a Span
    that is shifted over to reflect the updates

    Parameters
    ----------
    span: Span
    updates: Updates
    stretch_right : boolean, optional
        If True, stretch the right boundary of an annotation that butts up
        against the left of a new annotation. This is recommended for
        annotations that should fully cover a given span, like dialogues
        for documents.

    Returns
    -------
    span: Span

    See also
    --------
    shift_char: for details on how this works
    """
    start = shift_char(span.char_start, updates)
    if stretch_right:
        end = shift_char(span.char_end, updates)
    else:
        # this is to avoid spurious overstretching of the right
        # boundary of an annotation that butts up against the
        # left of a new annotation
        end = 1 + shift_char(span.char_end - 1, updates)
    return Span(start, end)
Example #11
def _dialogues_in_turns(corpus, turn1, turn2):
    """
    Given a pair of turns, return the ids of the dialogues enclosed
    by the span running from the first turn to the second
    """

    # grab a document from the set (assumption here is that
    # they are all morally the same doc)
    if not corpus.values():
        sys.exit("No documents selected")
    doc = corpus.values()[0]

    starting_turn = get_turn(turn1, doc)
    ending_turn = get_turn(turn2, doc)

    # there's a bit of fuzz for whitespace before/after the
    # turns
    span = Span(starting_turn.text_span().char_start - 1,
                ending_turn.text_span().char_end + 1)

    def is_in_range(anno):
        """
        If the annotation is a dialogue that is covered by the
        turns in question
        """
        return is_dialogue(anno) and span.encloses(anno.span)

    return [
        anno_id_to_tuple(x.local_id()) for x in doc.annotations()
        if is_in_range(x)
    ]
Example #12
class Token(RawToken, Standoff):
    """
    A token with a part of speech tag and some character offsets
    associated with it.
    """
    def __init__(self, tok, span):
        RawToken.__init__(self, tok.word, tok.tag)
        Standoff.__init__(self)
        self.span = span

    def __str__(self):
        return '%s\t%s' % (RawToken.__str__(self), self.span)

    def __unicode__(self):
        return '%s\t%s' % (RawToken.__unicode__(self), self.span)

    # left padding Token
    _lpad_word = '__START__'
    _lpad_tag = '__START__'
    _lpad_span = Span(0, 0)

    @classmethod
    def left_padding(cls):
        "Return a special Token for left padding"
        return Token(RawToken(cls._lpad_word, cls._lpad_tag), cls._lpad_span)
Example #13
def token_spans(text, tokens, offset=0):
    """
    Given a string and a sequence of RawToken representing tokens
    in that string, infer the span for each token.  Return the
    results as a sequence of Token objects.

    We infer these spans by walking the text as we consume tokens,
    and skipping over any whitespace in between. For this to work,
    the raw token text must be identical to the text modulo whitespace.

    Spans are relative to the start of the string itself, but can be
    shifted by passing an offset (the start of the original string's
    span)
    """
    token_words = [tok.word for tok in tokens]
    spans = generic_token_spans(text, token_words, offset)
    res = [Token(tok, span) for tok, span in zip(tokens, spans)]

    # sanity checks that should be moved to tests
    for orig_tok, new_tok in zip(tokens, res):
        span = Span(new_tok.span.char_start - offset,
                    new_tok.span.char_end - offset)
        snippet = text[span.char_start:span.char_end]
        assert snippet == new_tok.word
        assert orig_tok.word == new_tok.word
        assert orig_tok.tag == new_tok.tag
    return res
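A minimal usage sketch (hypothetical POS tags; assumes RawToken(word, tag) as constructed in Example #12):

toks = [RawToken("hello", "UH"), RawToken("world", "NN")]
for tok in token_spans("hello  world", toks):
    print(tok.word, tok.tag, tok.span)
# expected spans: Span(0, 5) for "hello", Span(7, 12) for "world"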
Example #14
def approximate_cover(elts, tgt):
    """Returns True if elts covers tgt's span.

    This is approximate because we only check that:
    * the first and last elements respectively begin and end at the
      extremities of tgt.span,
    * consecutive elements don't overlap.

    Because of the second item, we assume that elts has been sorted
    by span.

    Parameters
    ----------
    elts : sorted list of Annotation
        Sequence of elements
    tgt : Annotation
        Target annotation

    Returns
    -------
    res : boolean
        True if elts approximately cover tgt.span
    """
    span_seq = Span(elts[0].span.char_start, elts[-1].span.char_end)
    res = (span_eq(span_seq, tgt.text_span(), eps=1) and all(
        elt_cur.overlaps(elt_nxt) is None
        for elt_cur, elt_nxt in zip(elts[:-1], elts[1:])))
    return res
Example #15
def _mk_token(ttoken, span):
    """
    Convert a tweaked token and the span it's been aligned with
    into a proper Token object.
    """
    if ttoken.offset != 0:
        span = Span(span.char_start + ttoken.offset, span.char_end)
    return Token(ttoken, span)
Example #16
File: nudge.py Project: moreymat/educe
    def anno(doc, prefix, tspan):
        "pad text segment as needed"

        prefix_t = "..."\
            if tspan.char_start + len(prefix) < info.span.char_start\
            else ""
        myspan = Span(info.span.char_start, tspan.char_end)
        return "".join([prefix, prefix_t, annotate_doc(doc, span=myspan)])
Example #17
File: weave.py Project: tjane/educe
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target
    document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of problem)

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates: Updates
    """
    res = Updates()

    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        for src_anno in src_annos:
            res.expected_src_only.remove(src_anno)  # prune from case 5
            src_span = src_anno.text_span()
            tgt_equiv = [
                x for x in tgt_annos
                if x.text_span().shift(tgt_to_src) == src_span
            ]
            if not tgt_equiv:  # case 4
                res.abnormal_src_only.append(src_anno)
            for tgt_anno in tgt_equiv:  # prune from case 2
                if tgt_anno in res.abnormal_tgt_only:
                    res.abnormal_tgt_only.remove(tgt_anno)

    return res
Example #18
 def __init__(self, node, children, origin=None):
     SearchableTree.__init__(self, node, children)
     Standoff.__init__(self, origin)
     if not children:
         raise Exception("Can't create a tree with no children")
     self.children = children
     start = min(x.span.char_start for x in children)
     end = max(x.span.char_end for x in children)
     self.span = Span(start, end)
Example #19
 def __init__(self, t, offset, origin=None):
     extent = t['extent']
     word = t['word']
     tag = t['POS']
     span = Span(extent[0], extent[1] + 1).shift(offset)
     postag.Token.__init__(self, postag.RawToken(word, tag), span)
     self.features = copy.copy(t)
     for k in ['s_id', 'word', 'extent', 'POS']:
         del self.features[k]
Example #20
 def __init__(self, node, children, link, origin=None):
     SearchableTree.__init__(self, node, children)
     Standoff.__init__(self, origin)
     nodes = children
     if not self.is_root():
         nodes.append(self.label())
     start = min(x.span.char_start for x in nodes)
     end = max(x.span.char_end for x in nodes)
     self.link = link
     self.span = Span(start, end)
     self.origin = origin
Example #21
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(tgt_doc,
                                           span,
                                           sub_text,
                                           minor=minor)
        # WIP new_span, depends on the offset
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = [
            "======= REPLACE TEXT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
    # commit message
    tgt_k, tgt_doc = list(tgt_corpus.items())[0]
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if tgt_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(tgt_k, anno_str_before, anno_str_after))
Example #22
    def shift_anno(anno, offset, point):
        """Get a shifted copy of an annotation"""
        anno2 = copy.deepcopy(anno)
        if not isinstance(anno, Unit):
            return anno2

        anno_span = anno2.text_span()
        if anno_span.char_start >= point:
            # if the annotation is entirely after the deletion site,
            # shift the whole span
            anno2.span = anno_span.shift(offset)
        elif anno_span.char_end >= point:
            # if the annotation straddles the substitution site,
            # stretch (shift its end)
            anno2.span = Span(anno_span.char_start,
                              anno_span.char_end + offset)
        return anno2
Example #23
File: corenlp.py Project: moreymat/educe
 def __init__(self, t, offset, origin=None):
     """
     Parameters
     ----------
     t : dict
         Token from corenlp's XML output.
     offset : int
         Offset from the span of the corenlp token to the document.
     origin : FileId, optional
         Identifier for the document.
     """
     extent = t['extent']
     word = t['word']
     tag = t['POS']
     span = Span(extent[0], extent[1] + 1).shift(offset)
     postag.Token.__init__(self, postag.RawToken(word, tag), span)
     self.features = copy.copy(t)
     for k in ['s_id', 'word', 'extent', 'POS']:
         del self.features[k]
Example #24
    def shift_anno(anno, span_old, text_new):
        """Get a shifted copy of an annotation"""
        anno2 = copy.deepcopy(anno)
        if not isinstance(anno, Unit):
            return anno2

        offset = (len(text_new) - (span_old.char_end - span_old.char_start))

        anno_span = anno2.text_span()
        if anno_span.char_start >= span_old.char_end:
            # if the annotation is entirely after the substitution site,
            # shift the whole span
            anno2.span = anno_span.shift(offset)
        elif anno_span.char_end >= span_old.char_end:
            # if the annotation straddles the substitution site,
            # stretch (shift its end)
            anno2.span = Span(anno_span.char_start,
                              anno_span.char_end + offset)
        return anno2
Example #25
def _parse_edu(descr, edu_start, start=0):
    """
    Parse an RST DT leaf string
    """
    sdesc = descr.strip()
    if sdesc.startswith("<s>"):
        sdesc = sdesc[3:]
    if sdesc.endswith("</s>"):
        sdesc = sdesc[:-4]

    if sdesc.startswith("<EDU>") and sdesc.endswith("</EDU>"):
        text = sdesc[5:-6]  # remove <EDU></EDU> mark
    elif sdesc.startswith("_!") and sdesc.endswith("_!"):
        text = sdesc[2:-2]
    else:
        text = sdesc

    end = start + len(text)
    span = Span(start, end)  # text-span (not the same as EDU span)
    return EDU(edu_start, span, text)
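A hand-worked illustration (hypothetical input): a leaf wrapped in the RST-DT "_!" markers has the markers stripped and its character span laid out from `start`:

edu = _parse_edu("_!Wanted to fix it._!", 3, start=100)
# expected: EDU number 3, text "Wanted to fix it." (17 chars), span Span(100, 117)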
Example #26
def syntactic_node_seq(ptree, tokens):
    """Find the sequence of syntactic nodes covering a sequence of tokens.

    Parameters
    ----------
    ptree: `nltk.tree.Tree`
        Syntactic tree.
    tokens: sequence of `Token`
        Sequence of tokens under scrutiny.

    Returns
    -------
    syn_nodes: list of `nltk.tree.Tree`
        Spanning sequence of nodes of the syntactic tree.
    """
    txt_span = Span(tokens[0].text_span().char_start,
                    tokens[-1].text_span().char_end)

    for node in ptree.subtrees(lambda t: t.text_span().encloses(txt_span)):
        # * spanning node
        if node.text_span() == txt_span:
            return [node]

        # * otherwise: spanning subsequence of kid nodes
        txt_span_start = txt_span.char_start
        txt_span_end = txt_span.char_end
        kids_start = [x.text_span().char_start for x in node]
        kids_end = [x.text_span().char_end for x in node]
        try:
            idx_left = kids_start.index(txt_span_start)
        except ValueError:
            continue
        try:
            idx_right = kids_end.index(txt_span_end)
        except ValueError:
            continue
        if idx_left == idx_right:
            continue
        return [x for x in node[idx_left:idx_right + 1]]
    else:
        return []
Example #27
File: features.py Project: fbuijs/educe
    def fill(self, current, edu1, edu2, target=None):
        vec = self if target is None else target
        doc = current.doc
        big_span = edu1.text_span().merge(edu2.text_span())

        # spans for the turns that come between the two edus
        turns_between_span = Span(edu1.turn.text_span().char_end,
                                  edu2.turn.text_span().char_start)
        turns_between = turns_in_span(doc, turns_between_span)

        inner_edus = edus_in_span(doc, big_span)
        if edu1.identifier() != ROOT:  # not present anyway
            inner_edus.remove(edu1)
        if edu2.identifier() != ROOT:
            inner_edus.remove(edu2)

        gap = EduGap(inner_edus=inner_edus,
                     turns_between=turns_between,
                     sf_cache=self.sf_cache)

        for key in self.keys:
            vec[key.name] = key.function(current, gap, edu1, edu2)
Example #28
class Sentence(Standoff):
    """
    Just a text span really
    """
    def __init__(self, num, span):
        super(Sentence, self).__init__()
        self.span = span

        self.num = num
        "sentence ID in document"

    def text_span(self):
        return self.span

    # left padding
    _lpad_num = -1
    _lpad_span = Span(0, 0)

    @classmethod
    def left_padding(cls):
        """Return a left padding Sentence"""
        return cls(cls._lpad_num, cls._lpad_span)
Example #29
File: nudge.py Project: moreymat/educe
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s" % (old_span, k),
                  file=sys.stderr)
        save_document(output_dir, k, new_doc)
        # for commit message generation
        span = old_span.merge(new_span)
        commit_info = CommitInfo(key=k,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
Example #30
def _load_rst_wsj_corpus_text_file_file(f):
    """Actually do load"""
    text = f.read()

    start = 0
    sent_id = 0
    output_sents = []
    output_paras = None  # paragraph markings are missing from these documents

    for sentence in text.split(FIL_SEP_SENT):
        end = start + len(sentence)
        # NEW: remove leading white spaces
        lws = len(sentence) - len(sentence.lstrip())
        if lws:
            start += lws
        # end NEW
        if end > start:
            output_sents.append(Sentence(sent_id, Span(start, end)))
            sent_id += 1
        start = end + 3  # + 3 stands for + len(FIL_SEP_SENT)
    # TODO remove trailing '\n' of last sentence

    return text, output_sents, output_paras