def test_combine_sentence_tex_following_latex_linebreak_conventions():
    """Check how sentences are split around LaTeX line breaks and newlines.

    The input contains three sentences on one physical line separated only by
    LaTeX line-break commands, a fourth sentence set off by blank lines, and a
    fifth sentence that wraps across two physical lines. Five sentences are
    expected in total.
    """
    # Three sentences sharing one physical line.
    single_line = (
        r"This is the first sentence.\\"
        + r"This is the second sentence.\linebreak "
        + "This is the third sentence."
    )
    tex_lines = [
        single_line,
        "",
        "This is the fourth sentence.",
        " ",
        # Separated by just one newline, so the two lines below are expected
        # to come back as a single sentence.
        "This is the fifth sentence, which is written on multiple",
        "lines.",
    ]
    tex = "\n".join(tex_lines)

    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    first = found[0]
    assert first.text == "This is the first sentence."
    assert first.end == 27
    assert found[1].text == "This is the second sentence."
    assert found[2].text == "This is the third sentence."
    assert found[3].text == "This is the fourth sentence."
    fifth_text = found[4].text
    assert (fifth_text ==
            "This is the fifth sentence, which is written on multiple lines.")
    assert len(found) == 5
def test_ignore_periods_in_equations():
    """A period inside inline math must not end the sentence.

    Expects one sentence, with the equation shown as ``<<equation-0>>``.
    """
    # NOTE(review): this function name is defined again further down in this
    # file. Python rebinds the name, so pytest would only collect the last
    # definition and this body would never run. The later definitions expect
    # different placeholder text -- confirm which expectation is current.
    tex = "This sentence has an $ equation. In $ the middle."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    assert len(found) == 1
    only_sentence = found[0]
    assert only_sentence.text == "This sentence has an <<equation-0>> the middle."
def test_ignore_periods_in_equations():
    """A period inside inline math must not end the sentence.

    Expects one sentence, with the equation shown as ``[[math]]``.
    """
    # NOTE(review): this function name is defined more than once in this
    # file. Python rebinds the name, so pytest would only collect the last
    # definition. The definitions expect different placeholder text --
    # confirm which expectation is current.
    # NOTE(review): unlike the neighbouring tests, the extractor here is built
    # with its default arguments (no from_named_sections_only=False) --
    # confirm that is intended.
    tex = "This sentence has an $ equation. In $ the middle."
    parser = SentenceExtractor()
    found = [*parser.parse("main.tex", tex)]

    assert len(found) == 1
    only_sentence = found[0]
    assert only_sentence.text == "This sentence has an [[math]] the middle."
def test_sentence_includes_preceding_equation():
    """A display equation that opens a sentence belongs to that sentence.

    The second extracted sentence is expected to start at the equation and
    run through the end of the trailing text.
    """
    # Regression case: this exact pattern showed up in an example paper.
    tex = "Sentence 1.\n\\[x\\]\nstarts the sentence."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    second = found[1]
    assert second.tex == "\\[x\\]\nstarts the sentence."
    assert second.start == 12
    assert second.end == 38
def test_ignore_periods_in_equations():
    """A period inside inline math must not end the sentence.

    Expects one sentence, with the equation wrapped in
    ``EQUATION_DEPTH_0_START`` / ``EQUATION_DEPTH_0_END`` markers.
    """
    # NOTE(review): this function name is also defined earlier in this file
    # with different expected text. Python rebinds the name, so only the last
    # definition is collected by pytest -- confirm the earlier ones are stale.
    tex = "This sentence has an $ equation. In $ the middle."
    expected_text = (
        "This sentence has an EQUATION_DEPTH_0_START equation. In EQUATION_DEPTH_0_END "
        + "the middle."
    )

    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    assert len(found) == 1
    assert found[0].text == expected_text
def test_sentence_from_within_command():
    """A sentence wrapped in a text-styling command is still extracted.

    The reported text, TeX and character offsets are expected to cover only
    the command's argument, not the command itself.
    """
    # The leading space is deliberate: an earlier version of the code failed
    # to extract the sentence when the input started with it.
    tex = r" \textit{Example sentence.}"
    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    first = found[0]
    assert first.text == "Example sentence."
    assert first.tex == "Example sentence."
    assert first.start == 9
    assert first.end == 26
def test_sentence_splitting_end_points():
    """Verify the start/end offsets of each sentence in an enumerated passage.

    The input has two plain sentences followed by two numbered items; four
    sentences with the listed character spans are expected.
    """
    tex = "This is a sentence. Next we describe two items. 1) The first item. 2) The second item."
    expected_spans = [(0, 19), (20, 47), (48, 66), (67, 86)]

    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    assert len(found) == 4
    # The length check above guarantees zip pairs every sentence with a span.
    for sentence, (begin, finish) in zip(found, expected_spans):
        assert sentence.start == begin
        assert sentence.end == finish
def test_sentence_splitting_end_points_and_more_text():
    """Verify sentence offsets for text with many short period-ended fragments.

    Eight sentences with the listed character spans are expected.
    """
    # NOTE(review): unlike the neighbouring tests, the extractor here is built
    # with its default arguments (no from_named_sections_only=False) --
    # confirm that is intended.
    tex = "This sentence. has extra. text. 1. first 2. second 3. third. And some extra. stuff."
    expected_spans = [
        (0, 14),
        (15, 25),
        (26, 31),
        (32, 40),
        (41, 50),
        (51, 60),
        (61, 76),
        (77, 83),
    ]

    parser = SentenceExtractor()
    found = [*parser.parse("main.tex", tex)]

    assert len(found) == 8
    # The length check above guarantees zip pairs every sentence with a span.
    for sentence, (begin, finish) in zip(found, expected_spans):
        assert sentence.start == begin
        assert sentence.end == finish
def test_extract_sentences():
    """Extract two sentences and check their offsets and plain text.

    The first sentence contains a macro with an optional and a required
    argument; its expected text has both arguments run together in place of
    the macro.
    """
    tex = "This is the first \\macro[arg]{sentence}. This is the second sentence."
    parser = SentenceExtractor(from_named_sections_only=False)
    found = [*parser.parse("main.tex", tex)]

    assert len(found) == 2
    first, second = found[0], found[1]

    assert first.start == 0
    assert first.end == 40
    assert first.text == "This is the first argsentence."

    assert second.start == 41
    assert second.end == 69
    assert second.text == "This is the second sentence."