示例#1
0
def test_combine_sentence_tex_following_latex_linebreak_conventions():
    r"""Check sentence boundaries implied by LaTeX line-break conventions.

    The input places three sentences on one physical line, separated by
    ``\\`` and ``\linebreak``. A fourth sentence follows after an empty line
    and is itself followed by a whitespace-only line. A fifth sentence is
    wrapped across two lines with a single newline. The test expects exactly
    five sentences, with the wrapped lines joined by one space.
    """
    # All on one line.
    single_line = (r"This is the first sentence.\\"
                   r"This is the second sentence.\linebreak "
                   "This is the third sentence.")
    tex_lines = [
        single_line,
        "",
        "This is the fourth sentence.",
        " ",
        # With only a single newline, consecutive lines should be considered the same sentence.
        "This is the fifth sentence, which is written on multiple",
        "lines.",
    ]
    tex = "\n".join(tex_lines)

    extractor = SentenceExtractor(from_named_sections_only=False)
    found = list(extractor.parse("main.tex", tex))

    first = found[0]
    assert first.text == "This is the first sentence."
    # The first sentence is 27 characters long and starts the document.
    assert first.end == 27
    assert found[1].text == "This is the second sentence."
    assert found[2].text == "This is the third sentence."
    assert found[3].text == "This is the fourth sentence."
    expected_fifth = "This is the fifth sentence, which is written on multiple lines."
    assert found[4].text == expected_fifth
    assert len(found) == 5
示例#2
0
def test_ignore_periods_in_equations():
    """A period inside an inline equation must not end the sentence.

    The whole input is expected to come back as one sentence whose text has
    the equation replaced by the placeholder ``<<equation-0>>``.
    """
    tex = "This sentence has an $ equation. In $ the middle."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = list(parser.parse("main.tex", tex))

    assert len(found) == 1
    only_sentence = found[0]
    assert only_sentence.text == "This sentence has an <<equation-0>> the middle."
示例#3
0
def test_ignore_periods_in_equations():
    """A period inside an inline equation must not end the sentence.

    Uses a default-constructed extractor. The whole input is expected to come
    back as one sentence whose text has the equation replaced by the
    placeholder ``[[math]]``.
    """
    tex = "This sentence has an $ equation. In $ the middle."
    parser = SentenceExtractor()
    found = list(parser.parse("main.tex", tex))

    assert len(found) == 1
    only_sentence = found[0]
    assert only_sentence.text == "This sentence has an [[math]] the middle."
示例#4
0
def test_sentence_includes_preceding_equation():
    """A display equation that opens a sentence belongs to that sentence.

    The second extracted sentence is expected to start at the equation
    (offset 12, right after the first sentence's newline) and run to the end
    of the input (offset 38).
    """
    # This was a specific case seen in an example paper.
    tex = "Sentence 1.\n\\[x\\]\nstarts the sentence."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = list(parser.parse("main.tex", tex))

    second = found[1]
    assert second.tex == "\\[x\\]\nstarts the sentence."
    assert second.start == 12
    assert second.end == 38
示例#5
0
def test_ignore_periods_in_equations():
    """A period inside an inline equation must not end the sentence.

    The whole input is expected to come back as one sentence whose text keeps
    the equation body, wrapped in ``EQUATION_DEPTH_0_START`` and
    ``EQUATION_DEPTH_0_END`` markers.
    """
    tex = "This sentence has an $ equation. In $ the middle."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = list(parser.parse("main.tex", tex))

    assert len(found) == 1
    expected_text = (
        "This sentence has an EQUATION_DEPTH_0_START equation. In EQUATION_DEPTH_0_END "
        "the middle."
    )
    assert found[0].text == expected_text
示例#6
0
def test_sentence_from_within_command():
    """A sentence wrapped in a text-styling command is extracted from inside it.

    Both ``text`` and ``tex`` of the first sentence are expected to be the
    command's argument alone, spanning offsets 9 to 26 of the input.
    """
    # The space in the start of this sentence is important, as in an earlier version of the
    # code, this space caused extraction to fail.
    tex = r" \textit{Example sentence.}"
    parser = SentenceExtractor(from_named_sections_only=False)
    found = list(parser.parse("main.tex", tex))

    first = found[0]
    assert first.text == "Example sentence."
    assert first.tex == "Example sentence."
    assert first.start == 9
    assert first.end == 26
示例#7
0
def test_sentence_splitting_end_points():
    """Verify the start/end offsets of each sentence in a four-sentence input.

    The input contains two plain sentences followed by two enumerated items
    ("1) ..." and "2) ..."); each is expected to be its own sentence.
    """
    tex = "This is a sentence. Next we describe two items. 1) The first item. 2) The second item."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = list(parser.parse("main.tex", tex))

    assert len(found) == 4

    # (start, end) character offsets expected for each sentence, in order.
    expected_spans = [(0, 19), (20, 47), (48, 66), (67, 86)]
    # Lengths were asserted equal above, so zip pairs every sentence with a span.
    for sentence, (expected_start, expected_end) in zip(found, expected_spans):
        assert sentence.start == expected_start
        assert sentence.end == expected_end
示例#8
0
def test_sentence_splitting_end_points_and_more_text():
    """Verify the start/end offsets of each sentence in an eight-sentence input.

    Uses a default-constructed extractor on text that mixes short fragments
    ending in periods with numbered items ("1. first", "2. second", ...).
    """
    tex = "This sentence. has extra. text. 1. first 2. second 3. third. And some extra. stuff."
    parser = SentenceExtractor()
    found = list(parser.parse("main.tex", tex))

    assert len(found) == 8

    # (start, end) character offsets expected for each sentence, in order.
    expected_spans = [
        (0, 14),
        (15, 25),
        (26, 31),
        (32, 40),
        (41, 50),
        (51, 60),
        (61, 76),
        (77, 83),
    ]
    # Lengths were asserted equal above, so zip pairs every sentence with a span.
    for sentence, (expected_start, expected_end) in zip(found, expected_spans):
        assert sentence.start == expected_start
        assert sentence.end == expected_end
示例#9
0
def test_extract_sentences():
    """Extract two sentences, the first of which contains a macro call.

    The expected text of the first sentence keeps the macro's optional and
    required argument contents ("arg" + "sentence") and drops the macro name;
    the offsets refer to positions in the original TeX input.
    """
    tex = "This is the first \\macro[arg]{sentence}. This is the second sentence."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = list(parser.parse("main.tex", tex))
    assert len(found) == 2

    # Exactly two sentences were asserted above, so unpacking is safe.
    leading, trailing = found

    assert leading.start == 0
    assert leading.end == 40
    assert leading.text == "This is the first argsentence."

    assert trailing.start == 41
    assert trailing.end == 69
    assert trailing.text == "This is the second sentence."