Example #1
def test_parse_wo_tabular():
    """Test the parser without extracting tabular information."""
    docs_path = "tests/data/html_simple/md.html"
    pdf_path = "tests/data/pdf_simple/"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True,
        tabular=False,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_path),
        language="en",
    )
    doc = parser_udf.apply(doc)

    # Check that doc has neither table nor cell
    assert len(doc.sections) == 1
    assert len(doc.paragraphs) == 44
    assert len(doc.figures) == 1
    assert len(doc.tables) == 0
    assert len(doc.cells) == 0
    assert len(doc.sentences) == 45

    # Check that sentences are associated with both section and paragraph.
    assert all([sent.section is not None for sent in doc.sentences])
    assert all([sent.paragraph is not None for sent in doc.sentences])

    # Check that sentences are NOT associated with cell
    assert all([sent.cell is None for sent in doc.sentences])
Example #2
def test_parse_multi_sections():
    """Test the parser with the radiology document."""
    # Test multi-section html
    docs_path = "tests/data/pure_html/radiology.html"
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "radiology"))
    parser_udf = get_parser_udf(structural=True,
                                tabular=True,
                                lingual=True,
                                visual=False)
    doc = parser_udf.apply(doc)

    assert len(doc.sections) == 5
    assert len(doc.paragraphs) == 30
    assert len(doc.sentences) == 35
    assert len(doc.figures) == 2

    assert doc.sections[0].name is None
    assert doc.sections[1].name == "label"
    assert doc.sections[2].name == "content"
    assert doc.sections[3].name == "image"

    assert doc.sections[2].paragraphs[0].name == "COMPARISON"
    assert doc.sections[2].paragraphs[1].name == "INDICATION"
    assert doc.sections[2].paragraphs[2].name == "FINDINGS"
    assert doc.sections[2].paragraphs[3].name == "IMPRESSION"
Example #3
def parse_dataset(train=False, first_time=False):
    if train:
        session = config.init_session(config.conn_string_train)
    else:
        session = config.init_session(config.conn_string_predict)

    if not first_time:
        pass

    elif train:
        doc_preprocessor = HTMLDocPreprocessor(config.train_docs_path)
        corpus_parser = Parser(session,
                               structural=False,
                               lingual=True,
                               tabular=False,
                               replacements=[('\n', ' ')],
                               language='en_core_web_lg')
        corpus_parser.apply(doc_preprocessor, parallelism=config.PARALLEL)

    else:
        doc_preprocessor = HTMLDocPreprocessor(config.predict_docs_path)
        corpus_parser = Parser(session,
                               structural=False,
                               lingual=True,
                               tabular=False,
                               replacements=[('\n', ' ')],
                               language='en_core_web_lg')
        corpus_parser.apply(doc_preprocessor, parallelism=config.PARALLEL)

    return {
        'document_count': session.query(Document).count(),
        'sentence_count': session.query(Sentence).count(),
        'docs': session.query(Document).order_by(Document.name).all(),
        'session': session
    }
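A minimal usage sketch for parse_dataset above, assuming a hypothetical config module that defines conn_string_train, conn_string_predict, train_docs_path, predict_docs_path, and PARALLEL:

# Hypothetical call; the config values are assumptions, not part of the example above.
result = parse_dataset(train=True, first_time=True)
print(result['document_count'], result['sentence_count'])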
Example #4
def test_spacy_german(caplog):
    """Test the parser with the md document."""
    caplog.set_level(logging.INFO)

    docs_path = "tests/data/pure_html/brot.html"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(structural=True,
                                tabular=True,
                                lingual=True,
                                visual=False,
                                language="de")
    for _ in parser_udf.apply(doc):
        pass

    # Check that doc has sentences
    assert len(doc.sentences) == 841
    sent = sorted(doc.sentences, key=lambda x: x.position)[143]
    assert sent.ner_tags == [
        "O",
        "O",
        "LOC",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
    ]  # inaccurate
    assert sent.dep_labels == [
        "mo",
        "ROOT",
        "sb",
        "cm",
        "nk",
        "mo",
        "punct",
        "mo",
        "nk",
        "nk",
        "nk",
        "sb",
        "oc",
        "rc",
        "punct",
    ]
Example #5
def test_parse_style():
    """Test style tag parsing."""
    logger = logging.getLogger(__name__)

    docs_path = "tests/data/html_extended/ext_diseases.html"
    pdf_path = "tests/data/pdf_extended/"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "ext_diseases"))

    # Create a Parser and parse the diseases document
    parser_udf = get_parser_udf(
        structural=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_path),
    )
    doc = parser_udf.apply(doc)

    # Grab the sentences parsed by the Parser
    sentences = doc.sentences

    logger.warning(f"Doc: {doc}")
    for i, sentence in enumerate(sentences):
        logger.warning(f"    Sentence[{i}]: {sentence.html_attrs}")

    # sentences for testing
    sub_sentences = [
        {
            "index":
            6,
            "attr": [
                "class=col-header",
                "hobbies=work:hard;play:harder",
                "type=phenotype",
                "style=background: #f1f1f1; color: aquamarine; font-size: 18px;",
            ],
        },
        {
            "index": 9,
            "attr": ["class=row-header", "style=background: #f1f1f1;"]
        },
        {
            "index": 11,
            "attr": ["class=cell", "style=text-align: center;"]
        },
    ]

    # Assertions
    assert all(sentences[p["index"]].html_attrs == p["attr"]
               for p in sub_sentences)
Example #6
def test_spacy_japanese(caplog):
    """Test the parser with the md document."""
    caplog.set_level(logging.INFO)

    # Test Japanese alpha tokenization
    docs_path = "tests/data/pure_html/japan.html"
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))
    parser_udf = get_parser_udf(structural=True,
                                tabular=True,
                                lingual=True,
                                visual=False,
                                language="ja")
    for _ in parser_udf.apply(doc):
        pass

    assert len(doc.sentences) == 289
    sent = doc.sentences[42]
    assert sent.text == "当時マルコ・ポーロが辿り着いたと言われる"
    assert sent.words == [
        "当時", "マルコ", "・", "ポーロ", "が", "辿り着い", "た", "と", "言わ", "れる"
    ]
    assert sent.pos_tags == [
        "NOUN",
        "PROPN",
        "SYM",
        "PROPN",
        "ADP",
        "VERB",
        "AUX",
        "ADP",
        "VERB",
        "AUX",
    ]
    assert sent.lemmas == [
        "当時",
        "マルコ-Marco",
        "・",
        "ポーロ-Polo",
        "が",
        "辿り着く",
        "た",
        "と",
        "言う",
        "れる",
    ]
    # Japanese sentences are only tokenized.
    assert sent.ner_tags == [""] * len(sent.words)
    assert sent.dep_labels == [""] * len(sent.words)
Example #7
def test_multimodal_cand(caplog):
    """Test multimodal candidate generation"""
    caplog.set_level(logging.INFO)

    PARALLEL = 4

    max_docs = 1
    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    docs_path = "tests/data/pure_html/radiology.html"

    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
    assert session.query(Document).count() == max_docs

    assert session.query(Sentence).count() == 35
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction

    ms_doc = mention_subclass("m_doc")
    ms_sec = mention_subclass("m_sec")
    ms_tab = mention_subclass("m_tab")
    ms_fig = mention_subclass("m_fig")
    ms_cell = mention_subclass("m_cell")
    ms_para = mention_subclass("m_para")
    ms_cap = mention_subclass("m_cap")
    ms_sent = mention_subclass("m_sent")

    m_doc = MentionDocuments()
    m_sec = MentionSections()
    m_tab = MentionTables()
    m_fig = MentionFigures()
    m_cell = MentionCells()
    m_para = MentionParagraphs()
    m_cap = MentionCaptions()
    m_sent = MentionSentences()

    ms = [ms_doc, ms_cap, ms_sec, ms_tab, ms_fig, ms_para, ms_sent, ms_cell]
    m = [m_doc, m_cap, m_sec, m_tab, m_fig, m_para, m_sent, m_cell]
    matchers = [DoNothingMatcher()] * 8

    mention_extractor = MentionExtractor(session,
                                         ms,
                                         m,
                                         matchers,
                                         parallelism=PARALLEL)

    mention_extractor.apply(docs)

    assert session.query(ms_doc).count() == 1
    assert session.query(ms_cap).count() == 2
    assert session.query(ms_sec).count() == 5
    assert session.query(ms_tab).count() == 2
    assert session.query(ms_fig).count() == 2
    assert session.query(ms_para).count() == 30
    assert session.query(ms_sent).count() == 35
    assert session.query(ms_cell).count() == 21
Example #8
def test_spacy_japanese():
    """Test the parser with the md document."""
    # Test Japanese alpha tokenization
    docs_path = "tests/data/pure_html/japan.html"
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))
    parser_udf = get_parser_udf(structural=True,
                                tabular=True,
                                lingual=True,
                                visual=False,
                                language="ja")
    doc = parser_udf.apply(doc)

    assert len(doc.sentences) == 308
    sent = doc.sentences[45]
    assert sent.text == "当時マルコ・ポーロが辿り着いたと言われる"
    assert sent.words == [
        "当時",
        "マルコ",
        "・",
        "ポーロ",
        "が",
        "辿り",
        "着い",
        "た",
        "と",
        "言わ",
        "れる",
    ]
    assert sent.lemmas == [
        "当時",
        "マルコ",
        "・",
        "ポーロ",
        "が",
        "辿る",
        "着く",
        "た",
        "と",
        "言う",
        "れる",
    ]
    # These tags are less stable (i.e., they change when the spaCy model changes),
    # so just check that values other than "" are assigned.
    assert all(sent.pos_tags)
    assert all(sent.ner_tags)
    assert all(sent.dep_labels)
Example #9
def test_warning_on_incorrect_filename():
    """Test that a warning is issued on invalid pdf."""
    docs_path = "tests/data/html_simple/md_para.html"
    pdf_path = "tests/data/html_simple/md_para.html"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md_para"))

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True, tabular=True, lingual=True, visual=True, pdf_path=pdf_path
    )
    with pytest.warns(RuntimeWarning) as record:
        doc = parser_udf.apply(doc)
    assert len(record) == 1
    assert "Visual parse failed" in record[0].message.args[0]
Example #10
def test_visual_linker_not_affected_by_order_of_sentences():
    """Test if visual_linker result is not affected by the order of sentences."""
    docs_path = "tests/data/html/2N6427.html"
    pdf_path = "tests/data/pdf/2N6427.pdf"

    # Initialize preprocessor, parser, visual_linker.
    # Note that parser is initialized with `visual=False` and that visual_linker
    # will be used to attach "visual" information to sentences after parsing.
    preprocessor = HTMLDocPreprocessor(docs_path)
    parser_udf = get_parser_udf(structural=True,
                                lingual=False,
                                tabular=True,
                                visual=False)
    visual_linker = VisualLinker(pdf_path=pdf_path)

    doc = parser_udf.apply(next(preprocessor.__iter__()))
    # Sort sentences by sentence.position
    doc.sentences = sorted(doc.sentences, key=attrgetter("position"))
    sentences0 = [
        sent for sent in visual_linker.link(doc.name, doc.sentences, pdf_path)
    ]
    # Sort again in case visual_linker.link changes the order
    sentences0 = sorted(sentences0, key=attrgetter("position"))

    doc = parser_udf.apply(next(preprocessor.__iter__()))
    # Shuffle
    random.shuffle(doc.sentences)
    sentences1 = [
        sent for sent in visual_linker.link(doc.name, doc.sentences, pdf_path)
    ]
    # Sort sentences by sentence.position
    sentences1 = sorted(sentences1, key=attrgetter("position"))

    # This should hold as both sentences are sorted by their position
    assert all([
        sent0.position == sent1.position
        for (sent0, sent1) in zip(sentences0, sentences1)
    ])

    # The following assertion should hold if the visual_linker result is not affected
    # by the order of sentences.
    assert all([
        sent0.left == sent1.left
        for (sent0, sent1) in zip(sentences0, sentences1)
    ])
Example #11
def test_spacy_chinese():
    """Test the parser with the md document."""
    # Test Chinese alpha tokenization
    docs_path = "tests/data/pure_html/chinese.html"
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))
    parser_udf = get_parser_udf(
        structural=True, tabular=True, lingual=True, visual=False, language="zh"
    )
    doc = parser_udf.apply(doc)

    assert len(doc.sentences) == 8
    sent = doc.sentences[1]
    assert sent.text == "我们和他对比谁更厉害!"
    assert sent.words == ["我们", "和", "他", "对比", "谁", "更", "厉害", "!"]
    # Chinese sentences are only tokenized.
    assert sent.ner_tags == ["", "", "", "", "", "", "", ""]
    assert sent.dep_labels == ["", "", "", "", "", "", "", ""]
Example #12
def parse_doc(docs_path: str, file_name: str, pdf_path: Optional[str] = None):
    max_docs = 1

    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    doc = next(doc_preprocessor._parse_file(docs_path, file_name))

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True if pdf_path else False,
        pdf_path=pdf_path,
        language="en",
    )
    doc = parser_udf.apply(doc)
    return doc
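A hedged usage sketch for parse_doc above, reusing test-data paths that appear in other examples on this page:

# Parse a single document; passing pdf_path enables visual parsing.
doc = parse_doc("tests/data/html_simple/md.html", "md", pdf_path="tests/data/pdf_simple/md.pdf")
print(len(doc.sentences))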
Example #13
def parse(html_location, database):
    """
    Wrapper function for calling the Fonduer parser.
    :param html_location: HTML files generated by ``parse_preprocess.py``.
    :param database: DB connection string.
    """
    session = Meta.init(database).Session()
    doc_preprocessor = HTMLDocPreprocessor(html_location)
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor)
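A hedged usage sketch for the wrapper above; the directory and connection string are placeholders:

parse("data/html/", "postgresql://localhost:5432/fonduer_db")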
Example #14
def test_simple_parser():
    """Unit test of Parser on a single document with lingual features off."""
    logger = logging.getLogger(__name__)

    docs_path = "tests/data/html_simple/md.html"
    pdf_path = "tests/data/pdf_simple/md.pdf"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))

    # Check that doc has a name
    assert doc.name == "md"

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True,
        lingual=False,
        visual=True,
        pdf_path=pdf_path,
        lingual_parser=SimpleParser(delim="NoDelim"),
    )
    for _ in parser_udf.apply(doc):
        pass

    logger.info(f"Doc: {doc}")
    for i, sentence in enumerate(doc.sentences):
        logger.info(f"    Sentence[{i}]: {sentence.text}")

    header = sorted(doc.sentences, key=lambda x: x.position)[0]
    # Test structural attributes
    assert header.xpath == "/html/body/h1"
    assert header.html_tag == "h1"
    assert header.html_attrs == ["id=sample-markdown"]

    # Test lingual attributes
    assert header.ner_tags == ["", ""]
    assert header.dep_labels == ["", ""]
    assert header.dep_parents == [0, 0]
    assert header.lemmas == ["", ""]
    assert header.pos_tags == ["", ""]

    assert len(doc.sentences) == 44
Example #15
def test_parse_error_doc_skipping(database_session):
    """Test skipping of faulty htmls."""
    faulty_doc_path = "tests/data/html_faulty/ext_diseases_missing_table_tag.html"
    preprocessor = HTMLDocPreprocessor(faulty_doc_path)
    session = database_session
    corpus_parser = Parser(session)
    corpus_parser.apply(preprocessor)
    # This returns documents that apply() was called on
    assert corpus_parser.last_docs == {"ext_diseases_missing_table_tag"}
    # This returns only documents that are successfully parsed.
    assert corpus_parser.get_last_documents() == []
Example #16
def test_warning_on_missing_pdf(caplog):
    """Test that a warning is issued on invalid pdf."""
    caplog.set_level(logging.INFO)

    docs_path = "tests/data/html_simple/md_para.html"
    pdf_path = "tests/data/pdf_simple/md_para_nonexistant.pdf"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md_para"))

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True, tabular=True, lingual=True, visual=True, pdf_path=pdf_path
    )
    with pytest.warns(RuntimeWarning) as record:
        for _ in parser_udf.apply(doc):
            pass
    assert len(record) == 1
    assert "Visual parse failed" in record[0].message.args[0]
Example #17
def test_warning_on_incorrect_filename():
    """Test that a warning is issued on invalid pdf."""
    docs_path = "tests/data/html_simple/md_para.html"
    pdf_path = "tests/data/html_simple/"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md_para"))

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_path),
    )
    with pytest.warns(RuntimeWarning) as record:
        doc = parser_udf.apply(doc)
    assert isinstance(record, type(pytest.warns(RuntimeWarning)))
Example #18
def test_spacy_japanese(caplog):
    """Test the parser with the md document."""
    caplog.set_level(logging.INFO)

    # Test Japanese alpha tokenization
    docs_path = "tests/data/pure_html/japan.html"
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))
    parser_udf = get_parser_udf(
        structural=True, tabular=True, lingual=True, visual=False, language="ja"
    )
    for _ in parser_udf.apply(doc):
        pass

    assert len(doc.sentences) == 289
    sent = doc.sentences[0]
    assert sent.text == "ジャパン-Wikipedia"
    assert sent.words == ["ジャパン", "-", "Wikipedia"]
    # Japanese sentences are only tokenized.
    assert sent.ner_tags == ["", "", ""]
    assert sent.dep_labels == ["", "", ""]
Example #19
def test_parse_table_span():
    logger = logging.getLogger(__name__)

    docs_path = "tests/data/html_simple/table_span.html"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "table_span"))

    # Check that doc has a name
    assert doc.name == "table_span"

    # Create a Parser and parse the document
    parser_udf = get_parser_udf(structural=True, lingual=True, visual=False)
    doc = parser_udf.apply(doc)

    logger.info(f"Doc: {doc}")

    assert len(doc.sentences) == 1
    for sentence in doc.sentences:
        logger.info(f"    Sentence: {sentence.text}")
Example #20
def test_spacy_chinese():
    """Test the parser with the md document."""
    # Test Chinese alpha tokenization
    docs_path = "tests/data/pure_html/chinese.html"
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "md"))
    parser_udf = get_parser_udf(structural=True,
                                tabular=True,
                                lingual=True,
                                visual=False,
                                language="zh")
    doc = parser_udf.apply(doc)

    assert len(doc.sentences) == 8
    sent = doc.sentences[1]
    assert sent.text == "我们和他对比谁更厉害!"
    assert sent.words == ["我们", "和", "他", "对比", "谁", "更", "厉害", "!"]
    # These tags are less stable (i.e., they change when the spaCy model changes),
    # so just check that values other than "" are assigned.
    assert all(sent.ner_tags)
    assert all(sent.dep_labels)
Example #21
def test_parse_style(caplog):
    """Test style tag parsing."""
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init("postgres://localhost:5432/" + ATTRIBUTE).Session()

    # spaCy on macOS has an issue with parallel parsing
    if os.name == "posix":
        PARALLEL = 1
    else:
        PARALLEL = 2  # Travis only gives 2 cores

    max_docs = 1
    docs_path = "tests/data/html_extended/ext_diseases.html"
    pdf_path = "tests/data/pdf_extended/ext_diseases.pdf"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    # Create a Parser and parse the md document
    omni = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
    omni.apply(preprocessor, parallelism=PARALLEL)

    # Grab the document
    doc = session.query(Document).order_by(Document.name).all()[0]

    # Grab the sentences parsed by the Parser
    sentences = list(session.query(Sentence).order_by(Sentence.position).all())

    logger.warning("Doc: {}".format(doc))
    for i, sentence in enumerate(sentences):
        logger.warning("    Sentence[{}]: {}".format(i, sentence.html_attrs))

    # sentences for testing
    sub_sentences = [
        {
            "index": 6,
            "attr": [
                "class=col-header",
                "hobbies=work:hard;play:harder",
                "type=phenotype",
                "style=background: #f1f1f1; color: aquamarine; font-size: 18px;",
            ],
        },
        {"index": 9, "attr": ["class=row-header", "style=background: #f1f1f1;"]},
        {"index": 11, "attr": ["class=cell", "style=text-align: center;"]},
    ]

    # Assertions
    assert all(sentences[p["index"]].html_attrs == p["attr"] for p in sub_sentences)
Example #22
def parse(html_location, database, parallelism=1):
    """
    Wrapper function for calling the Fonduer parser.
    :param html_location: HTML files generated by ``parse_preprocess.py``.
    :param database: DB connection string.
    :param parallelism: Number of cores to use.
    """
    session = Meta.init(database).Session()
    doc_preprocessor = HTMLDocPreprocessor(html_location)
    corpus_parser = Parser(session,
                           structural=True,
                           lingual=True,
                           parallelism=parallelism)
    corpus_parser.apply(doc_preprocessor)
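A hedged usage sketch of the parallel variant above; the directory, connection string, and core count are placeholders:

parse("data/html/", "postgresql://localhost:5432/fonduer_db", parallelism=4)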
Example #23
def test_parser_no_image():
    """Unit test of Parser on a single document that has a figure without image."""
    docs_path = "tests/data/html_simple/no_image.html"
    pdf_path = "tests/data/pdf_simple/"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor._parse_file(docs_path, "no_image"))

    # Check that doc has a name
    assert doc.name == "no_image"

    # Create a Parser and parse the no_image document
    parser_udf = get_parser_udf(
        structural=True,
        lingual=False,
        visual=True,
        visual_parser=PdfVisualParser(pdf_path),
    )
    doc = parser_udf.apply(doc)

    # Check that doc has no figures
    assert len(doc.figures) == 0
Example #24
def test_visualizer(caplog):
    """Unit test of visualizer using the md document."""
    from fonduer.utils.visualizer import Visualizer  # noqa
    caplog.set_level(logging.INFO)
    session = Meta.init("postgresql://localhost:5432/" + ATTRIBUTE).Session()

    PARALLEL = 1
    max_docs = 1
    docs_path = "tests/data/html_simple/md.html"
    pdf_path = "tests/data/pdf_simple/md.pdf"

    # Preprocessor for the Docs
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    # Create a Parser and parse the md document
    corpus_parser = Parser(session,
                           structural=True,
                           lingual=True,
                           visual=True,
                           pdf_path=pdf_path)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

    # Grab the md document
    doc = session.query(Document).order_by(Document.name).all()[0]
    assert doc.name == "md"

    organization_ngrams = MentionNgrams(n_max=1)

    Org = mention_subclass("Org")

    organization_matcher = OrganizationMatcher()

    mention_extractor = MentionExtractor(session, [Org], [organization_ngrams],
                                         [organization_matcher])

    mention_extractor.apply([doc], parallelism=PARALLEL)

    Organization = candidate_subclass("Organization", [Org])

    candidate_extractor = CandidateExtractor(session, [Organization])

    candidate_extractor.apply([doc], split=0, parallelism=PARALLEL)

    cands = session.query(Organization).filter(Organization.split == 0).all()

    # Test visualizer
    pdf_path = "tests/data/pdf_simple"
    vis = Visualizer(pdf_path)
    vis.display_candidates([cands[0]])
Example #25
def mention_setup():
    """Set up mentions."""
    docs_path = "tests/data/html_simple/md.html"
    pdf_path = "tests/data/pdf_simple/"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc = next(preprocessor.__iter__())

    # Create a Parser and parse the md document
    parser_udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_path),
        language="en",
    )
    doc = parser_udf.apply(doc)

    # Create 1-gram span mentions
    space = MentionNgrams(n_min=1, n_max=1)
    mentions = [tc for tc in space.apply(doc)]
    return mentions
Example #26
def parse(session: Session, docs_path: str, pdf_path: str) -> List[Document]:
    """Parse documents using Parser UDF Runner."""
    # Preprocessor for the Docs
    doc_preprocessor = HTMLDocPreprocessor(docs_path)

    # Create a Parser and parse the documents
    corpus_parser = Parser(
        session,
        parallelism=1,
        structural=True,
        lingual=True,
        visual_parser=PdfVisualParser(pdf_path),
    )

    corpus_parser.clear()
    corpus_parser.apply(doc_preprocessor)
    return corpus_parser.get_documents()
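A hedged usage sketch; the connection string and paths are placeholders, and Meta is imported from fonduer as in the other examples:

session = Meta.init("postgresql://localhost:5432/fonduer_db").Session()
docs = parse(session, "tests/data/html/", "tests/data/pdf/")
print(len(docs))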
Example #27
    def parse(docs_path, pdf_path):
        # Preprocessor for the Docs
        doc_preprocessor = HTMLDocPreprocessor(docs_path)

        # Create a Parser and parse the documents
        corpus_parser = Parser(
            session,
            parallelism=1,
            structural=True,
            lingual=True,
            visual=True,
            pdf_path=pdf_path,
        )

        corpus_parser.clear()
        corpus_parser.apply(doc_preprocessor)
        return corpus_parser
Example #28
def setup_common_components():
    """Set up mention/candidate extractor."""
    preprocessor = HTMLDocPreprocessor("tests/data/html/")
    parser = Parser(None)
    mention_extractor = MentionExtractor(
        None,
        [Part, Temp],
        [MentionNgramsPart(parts_by_doc=None, n_max=3), MentionNgramsTemp(n_max=2)],
        [part_matcher, temp_matcher],
    )
    candidate_extractor = CandidateExtractor(None, [PartTemp], [temp_throttler])
    return {
        "preprocessor": preprocessor,
        "parser": parser,
        "mention_extractor": mention_extractor,
        "candidate_extractor": candidate_extractor,
    }
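A hedged usage sketch showing how the returned components might be consumed (Part, Temp, the matchers, and temp_throttler are assumed to be defined elsewhere in the test suite):

components = setup_common_components()
preprocessor = components["preprocessor"]
mention_extractor = components["mention_extractor"]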
Example #29
def test_simple_tokenizer(caplog):
    """Unit test of Parser on a single document with lingual features off."""
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init("postgres://localhost:5432/" + ATTRIBUTE).Session()

    # spaCy on macOS has an issue with parallel parsing
    if os.name == "posix":
        PARALLEL = 1
    else:
        PARALLEL = 2  # Travis only gives 2 cores

    max_docs = 2
    docs_path = "tests/data/html_simple/"
    pdf_path = "tests/data/pdf_simple/"

    # Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    parser = Parser(structural=True,
                    lingual=False,
                    visual=True,
                    pdf_path=pdf_path)
    parser.apply(preprocessor, parallelism=PARALLEL)

    doc = session.query(Document).order_by(Document.name).all()[1]

    logger.info("Doc: {}".format(doc))
    for i, sentence in enumerate(doc.sentences):
        logger.info("    Sentence[{}]: {}".format(i, sentence.text))

    header = sorted(doc.sentences, key=lambda x: x.position)[0]
    # Test structural attributes
    assert header.xpath == "/html/body/h1"
    assert header.html_tag == "h1"
    assert header.html_attrs == ["id=sample-markdown"]

    # Test lingual attributes
    assert header.ner_tags == ["", ""]
    assert header.dep_labels == ["", ""]
    assert header.dep_parents == [0, 0]
    assert header.lemmas == ["", ""]
    assert header.pos_tags == ["", ""]

    assert len(doc.sentences) == 44
Example #30
def test_ngrams(caplog):
    """Test ngram limits in mention extraction"""
    caplog.set_level(logging.INFO)

    PARALLEL = 4

    max_docs = 1
    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    docs_path = "tests/data/pure_html/lincoln_short.html"

    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
    assert session.query(Document).count() == max_docs
    assert session.query(Sentence).count() == 503
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    Person = mention_subclass("Person")
    person_ngrams = MentionNgrams(n_max=3)
    person_matcher = PersonMatcher()

    mention_extractor = MentionExtractor(
        session, [Person], [person_ngrams], [person_matcher]
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Person).count() == 118
    mentions = session.query(Person).all()
    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 49
    assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0

    # Test for unigram exclusion
    person_ngrams = MentionNgrams(n_min=2, n_max=3)
    mention_extractor = MentionExtractor(
        session, [Person], [person_ngrams], [person_matcher]
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)
    assert session.query(Person).count() == 69
    mentions = session.query(Person).all()
    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 0
    assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0