def test_visual_linker_not_affected_by_order_of_sentences():
    """Test if visual_linker result is not affected by the order of sentences.

    The same document is parsed twice. The first time its sentences are handed
    to the visual linker sorted by ``position``; the second time they are
    handed over in a shuffled order. After re-sorting both results by
    ``position``, every pair of corresponding sentences must have the same
    ``left`` value.
    """
    docs_path = "tests/data/html/2N6427.html"
    pdf_path = "tests/data/pdf/2N6427.pdf"

    # Initialize preprocessor, parser, visual_linker.
    # Note that parser is initialized with `visual=False` and that visual_linker
    # will be used to attach "visual" information to sentences after parsing.
    preprocessor = HTMLDocPreprocessor(docs_path)
    parser_udf = get_parser_udf(
        structural=True, lingual=False, tabular=True, visual=False
    )
    visual_linker = VisualLinker(pdf_path=pdf_path)

    by_position = attrgetter("position")

    # Reference run: sentences are given to the linker sorted by position.
    doc = parser_udf.apply(next(iter(preprocessor)))
    doc.sentences = sorted(doc.sentences, key=by_position)
    # Sort the output again in case visual_linker.link changes the order.
    sentences0 = sorted(
        visual_linker.link(doc.name, doc.sentences, pdf_path), key=by_position
    )

    # Second run: a fresh parse of the same document, sentences shuffled
    # in place before linking.
    doc = parser_udf.apply(next(iter(preprocessor)))
    random.shuffle(doc.sentences)
    sentences1 = sorted(
        visual_linker.link(doc.name, doc.sentences, pdf_path), key=by_position
    )

    # zip() stops at the shorter input, so without these two checks the
    # comparisons below would pass vacuously if either run lost sentences.
    assert sentences0, "visual_linker.link returned no sentences"
    assert len(sentences0) == len(sentences1)

    # This should hold as both sentences are sorted by their position
    assert all(
        sent0.position == sent1.position
        for sent0, sent1 in zip(sentences0, sentences1)
    )

    # The following assertion should hold if the visual_linker result is not
    # affected by the order of sentences.
    assert all(
        sent0.left == sent1.left for sent0, sent1 in zip(sentences0, sentences1)
    )
def parse_doc(docs_path: str, file_name: str, pdf_path: Optional[str] = None):
    """Parse one HTML file and return the result of the parser UDF.

    Args:
        docs_path: Path handed both to ``HTMLDocPreprocessor`` and to its
            ``_parse_file`` method.
        file_name: File name handed to ``_parse_file`` alongside ``docs_path``.
        pdf_path: Optional PDF location forwarded to the parser. Visual
            parsing is switched on only when this value is truthy.

    Returns:
        Whatever ``parser_udf.apply`` returns for the first document yielded
        by the preprocessor.
    """
    logger.info("Parsing...")

    # Only the first document produced for this file is used.
    preprocessor = HTMLDocPreprocessor(docs_path, max_docs=1)
    raw_doc = next(preprocessor._parse_file(docs_path, file_name))

    # Build an English parser with structural, tabular and lingual features;
    # the visual feature depends on whether a PDF path was supplied.
    use_visual = bool(pdf_path)
    parser_udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=use_visual,
        pdf_path=pdf_path,
        language="en",
    )
    return parser_udf.apply(raw_doc)
def mention_setup():
    """Parse the ``md.html`` fixture and return its 1-gram span mentions.

    Returns:
        A list holding every item produced by applying
        ``MentionNgrams(n_min=1, n_max=1)`` to the parsed document.
    """
    html_source = "tests/data/html_simple/md.html"
    pdf_source = "tests/data/pdf_simple/"

    # Take the first document the preprocessor yields.
    raw_doc = next(iter(HTMLDocPreprocessor(html_source)))

    # Parse it with every feature enabled; visual information comes from
    # a PdfVisualParser pointed at the PDF directory.
    parser_udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_source),
        language="en",
    )
    parsed_doc = parser_udf.apply(raw_doc)

    # Unigram-only mention space (n_min == n_max == 1).
    unigram_space = MentionNgrams(n_min=1, n_max=1)
    return list(unigram_space.apply(parsed_doc))