Example #1
File: convert.py  Project: amitbeka/ucca
def to_text(passage, sentences=True):
    """Converts from a Passage object to tokenized strings.

    Args:
        passage: the Passage object to convert
        sentences: whether to break the Passage into sentences (one string
        per sentence) or leave it as a single string. Defaults to True

    Returns:
        a list of strings: a single string if sentences=False, one string per
        sentence otherwise

    """
    tokens = [
        x.text for x in sorted(passage.layer(layer0.LAYER_ID).all,
                               key=lambda x: x.position)
    ]
    # break2sentences returns the positions of the sentence-end tokens. Token
    # indices start at 0 while positions start at 1, so each end position is
    # exactly the token index at which the next sentence starts. We prepend
    # index 0 as the start of the first sentence.
    if sentences:
        starts = [0] + util.break2sentences(passage)
    else:
        starts = [0, len(tokens)]
    return [
        ' '.join(tokens[starts[i]:starts[i + 1]])
        for i in range(len(starts) - 1)
    ]
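
A minimal standalone sketch (hypothetical token values, not from the project) of how the start indices above become sentence slices:

# Hypothetical illustration of to_text's slicing (values made up):
# break2sentences would report end positions [4, 7] for these tokens.
tokens = ['1', '2', '3', '.', '5', '6', '.']
starts = [0, 4, 7]   # i.e. [0] + util.break2sentences(passage)
sentences = [' '.join(tokens[starts[i]:starts[i + 1]])
             for i in range(len(starts) - 1)]
assert sentences == ['1 2 3 .', '5 6 .']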
Example #2
    def test_break2sentences(self):
        """Tests identifying correctly sentence ends.

        Passage: [1 2 [3 P] H] . [[5 6 . P] H]
                 [[8 P] . 10 . H]

        """
        p = core.Passage('1')
        l0 = layer0.Layer0(p)
        l1 = layer1.Layer1(p)
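        # Terminals at positions 1-11: '1' '2' '3' '.' '5' '6' '.' '8' '.' '10' '.'
        # ('.' terminals are punctuation; the last four belong to paragraph 2)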
        terms = [l0.add_terminal(str(i), False) for i in range(1, 4)]
        terms.append(l0.add_terminal('.', True))
        terms.append(l0.add_terminal('5', False))
        terms.append(l0.add_terminal('6', False))
        terms.append(l0.add_terminal('.', True))
        terms.append(l0.add_terminal('8', False, paragraph=2))
        terms.append(l0.add_terminal('.', True, paragraph=2))
        terms.append(l0.add_terminal('10', False, paragraph=2))
        terms.append(l0.add_terminal('.', True, paragraph=2))
        h1 = l1.add_fnode(None, layer1.EdgeTags.ParallelScene)
        h2 = l1.add_fnode(None, layer1.EdgeTags.ParallelScene)
        h3 = l1.add_fnode(None, layer1.EdgeTags.ParallelScene)
        p1 = l1.add_fnode(h1, layer1.EdgeTags.Process)
        p2 = l1.add_fnode(h2, layer1.EdgeTags.Process)
        p3 = l1.add_fnode(h3, layer1.EdgeTags.Process)
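        # Distribute the terminals among three parallel scenes (h1-h3), one
        # per expected sentence; p1-p3 are Process nodes inside each scene.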
        h1.add(layer1.EdgeTags.Terminal, terms[0])
        h1.add(layer1.EdgeTags.Terminal, terms[1])
        p1.add(layer1.EdgeTags.Terminal, terms[2])
        l1.add_punct(None, terms[3])
        p2.add(layer1.EdgeTags.Terminal, terms[4])
        p2.add(layer1.EdgeTags.Terminal, terms[5])
        l1.add_punct(p2, terms[6])
        p3.add(layer1.EdgeTags.Terminal, terms[7])
        l1.add_punct(h3, terms[8])
        h3.add(layer1.EdgeTags.Terminal, terms[9])
        l1.add_punct(h3, terms[10])

        self.assertSequenceEqual(util.break2sentences(p), [4, 7, 11])
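
As a cross-check (a sketch, not part of the original test), to_text from Example #1 uses these end positions as slice boundaries, so the same passage should split into three sentence strings:

        # Hypothetical extra assertion; assumes to_text is importable here
        # (e.g. from convert.py above). starts = [0, 4, 7, 11], so the
        # slices are tokens[0:4], tokens[4:7] and tokens[7:11].
        self.assertSequenceEqual(convert.to_text(p),
                                 ['1 2 3 .', '5 6 .', '8 . 10 .'])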