Example #1
def test_parse_structure(caplog):
    """Unit test of OmniParserUDF.parse_structure().

    This only tests the structural parse of the document.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_simple/md.html'
    pdf_path = 'tests/data/pdf_simple/md.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab one (document, text) tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info("    Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,  # structural
        ["style"],  # blacklist
        ["span", "br"],  # flatten
        '',  # flatten delim
        True,  # lingual
        True,  # strip
        [(u'[\u2010\u2011\u2012\u2013\u2014\u2212\uf02d]', '-')],  # replace
        True,  # tabular
        True,  # visual
        pdf_path,  # pdf path
        Spacy())  # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning("    Phrase: {}".format(phrase.text))

    header = phrases[0]
    # Test structural attributes
    assert header.xpath == '/html/body/h1'
    assert header.html_tag == 'h1'
    assert header.html_attrs == ['id=sample-markdown']

    # Test the unicode parse of delta
    assert (phrases[-1].text == "δ13Corg")

    # 45 phrases are expected in the "md" document.
    assert len(phrases) == 45
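
Both test functions on this page assume module-level imports and an ATTRIBUTE database name defined elsewhere in the test suite. A minimal sketch of that setup follows; the Fonduer import paths are best guesses that vary between releases, and the ATTRIBUTE value is a hypothetical placeholder.

import logging

# Best-guess import paths for the OmniParser-era Fonduer API; adjust them to
# the version you have installed.
from fonduer import HTMLPreprocessor, Meta
from fonduer.parser import OmniParserUDF
from fonduer.parser.spacy_parser import Spacy

# Hypothetical placeholder; the real test module defines its own database name.
ATTRIBUTE = "parser_test"
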
Example #2
def test_parse_style(caplog):
    """Test style tag parsing."""
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_extended/ext_diseases.html'
    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab the (document, text) tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info("    Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,           # structural
        [],             # blacklist, empty so that style is not blacklisted
        ["span", "br"],  # flatten
        '',             # flatten delim
        True,           # lingual
        True,           # strip
        [],             # replace
        True,           # tabular
        True,           # visual
        pdf_path,       # pdf path
        Spacy())        # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning("    Phrase: {}".format(phrase.html_attrs))

    # Expected html_attrs for selected phrases
    sub_phrases = [
        {
            'index': 7,
            'attr': [
                'class=col-header',
                'hobbies=work:hard;play:harder',
                'type=phenotype',
                'style=background: #f1f1f1; color: aquamarine; font-size: 18px;'
            ]
        },
        {
            'index': 10,
            'attr': ['class=row-header', 'style=background: #f1f1f1;']
        },
        {
            'index': 12,
            'attr': ['class=cell', 'style=text-align: center;']
        }
    ]
    
    # Assertions
    assert all(
        phrases[p['index']].html_attrs == p['attr'] for p in sub_phrases
    )
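
The final assert all(...) only reports True or False, so a failure does not show which sub-phrase mismatched. A sketch of an equivalent check with an explicit loop and per-index messages, reusing the phrases and sub_phrases variables from Example #2:

# Same check as above, unrolled so that a failure reports the offending
# phrase index together with the actual and expected attribute lists.
for expected in sub_phrases:
    actual = phrases[expected['index']].html_attrs
    assert actual == expected['attr'], (
        "Phrase {}: got {}, expected {}".format(
            expected['index'], actual, expected['attr']))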