Example #1
def test_parse_structure(caplog):
    """Unit test of OmniParserUDF.parse_structure().

    This only tests the structural parse of the document.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_simple/md.html'
    pdf_path = 'tests/data/pdf_simple/md.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab one (document, text) tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info("    Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,  # structural
        ["style"],  # blacklist
        ["span", "br"],  # flatten
        '',  # flatten delim
        True,  # lingual
        True,  # strip
        [(u'[\u2010\u2011\u2012\u2013\u2014\u2212\uf02d]', '-')],  # replace
        True,  # tabular
        True,  # visual
        pdf_path,  # pdf path
        Spacy())  # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning("    Phrase: {}".format(phrase.text))

    header = phrases[0]
    # Test structural attributes
    assert header.xpath == '/html/body/h1'
    assert header.html_tag == 'h1'
    assert header.html_attrs == ['id=sample-markdown']

    # Test the unicode parse of delta
    assert (phrases[-1].text == "δ13Corg")

    # 45 phrases are expected in the "md" document.
    assert len(phrases) == 45
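
Both test functions on this page assume module-level imports and an ATTRIBUTE database name defined elsewhere in the test suite. A minimal sketch of that setup follows; the Fonduer import paths are best guesses that vary between releases, and the ATTRIBUTE value is a hypothetical placeholder.

import logging

# Best-guess import paths for the OmniParser-era Fonduer API; adjust them to
# the version you have installed.
from fonduer import HTMLPreprocessor, Meta
from fonduer.parser import OmniParserUDF
from fonduer.parser.spacy_parser import Spacy

# Hypothetical placeholder; the real test module defines its own database name.
ATTRIBUTE = "parser_test"
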
Example #2
def test_parse_style(caplog):
    """Test style tag parsing."""
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_extended/ext_diseases.html'
    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab the (document, text) tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info("    Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,           # structural
        [],             # blacklist, empty so that style is not blacklisted
        ["span", "br"],  # flatten
        '',             # flatten delim
        True,           # lingual
        True,           # strip
        [],             # replace
        True,           # tabular
        True,           # visual
        pdf_path,       # pdf path
        Spacy())        # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning("    Phrase: {}".format(phrase.html_attrs))

    # Expected html_attrs for selected phrases
    sub_phrases = [
        {
            'index': 7,
            'attr': [
                'class=col-header',
                'hobbies=work:hard;play:harder',
                'type=phenotype',
                'style=background: #f1f1f1; color: aquamarine; font-size: 18px;'
            ]
        },
        {
            'index': 10,
            'attr': ['class=row-header', 'style=background: #f1f1f1;']
        },
        {
            'index': 12,
            'attr': ['class=cell', 'style=text-align: center;']
        }
    ]
    
    # Assertions
    assert all(
        phrases[p['index']].html_attrs == p['attr'] for p in sub_phrases
    )
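
The final assert all(...) only reports True or False, so a failure does not show which sub-phrase mismatched. A sketch of an equivalent check with an explicit loop and per-index messages, reusing the phrases and sub_phrases variables from Example #2:

# Same check as above, unrolled so that a failure reports the offending
# phrase index together with the actual and expected attribute lists.
for expected in sub_phrases:
    actual = phrases[expected['index']].html_attrs
    assert actual == expected['attr'], (
        "Phrase {}: got {}, expected {}".format(
            expected['index'], actual, expected['attr']))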