Python CSSExtractor示例，data_extractor.lxml.CSSExtractor Python示例

示例#1

0

显示文件

文件： test_lxml.py 项目： NyntoFive/data_extractor

def test_invalid_css_selector_expr(element, expr):
    """Extracting with an invalid CSS expression must raise ``ExprError``.

    The error has to reference the extractor that failed and wrap the
    underlying ``SelectorSyntaxError``.
    """
    css = CSSExtractor(expr)

    with pytest.raises(ExprError) as excinfo:
        css.extract(element)

    error = excinfo.value
    assert error.extractor is css
    assert isinstance(error.exc, SelectorSyntaxError)

示例#2

0

显示文件

def test_item_extract_failure_when_last_field_missing(
    element2, Article0, build_first
):
    """Check the many-item extraction failure and the ``built`` flags around it.

    The item, its extractor and its ``title``/``content`` fields must start
    unbuilt, must be built after an explicit ``build()`` (only performed when
    ``build_first`` is truthy), and must be built after the failed
    ``extract()``.  The raised ``ExtractError`` must list ``Article0.content``
    followed by the item, and reference the element selected by
    ``//li[@class='article'][2]``.
    """
    item = Article0(CSSExtractor("li.article"), is_many=True)

    def parts():
        # Lazy on purpose: each attribute is looked up right before its check.
        yield item
        yield item.extractor
        yield item.title
        yield item.content

    for part in parts():
        assert not part.built

    if build_first:
        item.build()
        for part in parts():
            assert part.built

    with pytest.raises(ExtractError) as excinfo:
        item.extract(element2)

    for part in parts():
        assert part.built

    error = excinfo.value
    assert len(error.extractors) == 2
    assert error.extractors[0] is Article0.content
    assert error.extractors[1] is item
    assert error.element is element2.xpath("//li[@class='article'][2]")[0]

示例#3

0

显示文件

def test_item_extract(element1, Article0, build_first):
    """Extract every ``li.article`` as a list and track the ``built`` flags.

    Nothing may be built right after construction; everything must be built
    after an explicit ``build()`` (when ``build_first`` is truthy) and after
    ``extract()`` has run.
    """
    item = Article0(CSSExtractor("li.article"), is_many=True)

    def parts():
        # Lazy on purpose: each attribute is looked up right before its check.
        yield item
        yield item.extractor
        yield item.title
        yield item.content

    for part in parts():
        assert not part.built

    if build_first:
        item.build()
        for part in parts():
            assert part.built

    assert item.extract(element1) == [
        {"title": "Title 1", "content": "Content 1"},
        {"title": "Title 2", "content": "Content 2"},
    ]

    for part in parts():
        assert part.built

示例#4

0

显示文件

文件： test_item.py 项目： linw1995/data_extractor

def test_item_extract_success_without_is_many_when_last_field_missing(
    element2, Article0
):
    """Without ``is_many`` the extraction of ``element2`` yields the first article."""
    article = Article0(CSSExtractor("li.article"))
    extracted = article.extract(element2)
    assert extracted == {"title": "Title 1", "content": "Content 1"}

示例#5

0

显示文件

文件： test_lxml.py 项目： linw1995/data_extractor

def test_invalid_css_selector_expr(element, expr):
    """Constructing a ``CSSExtractor`` from an invalid expression must fail.

    The ``ExprError`` has to wrap a cssselect ``SelectorError`` and render a
    message matching the expected pattern.
    """
    with pytest.raises(ExprError) as excinfo:
        CSSExtractor(expr)

    error = excinfo.value
    # Third Party Library
    from cssselect.parser import SelectorError

    assert isinstance(error.exc, SelectorError)
    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))

示例#6

0

显示文件

文件： test_item.py 项目： NyntoFive/data_extractor

def test_item_extract_failure_when_last_field_missing(element2, Article0):
    """A many-item extraction over ``element2`` must raise ``ExtractError``.

    The error must list ``Article0.content`` followed by the item itself and
    reference the element selected by ``//li[@class='article'][2]``.
    """
    item = Article0(CSSExtractor("li.article"), is_many=True)

    with pytest.raises(ExtractError) as excinfo:
        item.extract(element2)

    error = excinfo.value
    assert len(error.extractors) == 2
    assert error.extractors[0] is Article0.content
    assert error.extractors[1] is item
    assert error.element is element2.xpath("//li[@class='article'][2]")[0]

示例#7

0

显示文件

文件： test_item.py 项目： NyntoFive/data_extractor

def test_item_extract(element1, Article0):
    """With ``is_many=True`` every ``li.article`` in ``element1`` is extracted."""
    articles = Article0(CSSExtractor("li.article"), is_many=True)
    extracted = articles.extract(element1)
    assert extracted == [
        {"title": "Title 1", "content": "Content 1"},
        {"title": "Title 2", "content": "Content 2"},
    ]

示例#8

0

显示文件

文件： test_utils.py 项目： linw1995/data_extractor

def test_missing_cssselect():
    """Each CSS-based extractor must raise ``RuntimeError`` naming ``cssselect``."""
    # Lambdas keep construction lazy so each one happens inside ``pytest.raises``.
    constructors = (
        lambda: CSSExtractor("a>b"),
        lambda: AttrCSSExtractor("a>b", "href"),
        lambda: TextCSSExtractor("a>b"),
    )

    for construct in constructors:
        with pytest.raises(RuntimeError) as excinfo:
            construct()

        assert "cssselect" in str(excinfo.value)

示例#9

0

显示文件

文件： test_lxml.py 项目： walison17/data_extractor

def test_invalid_css_selector_expr(element, expr, by):
    """An invalid CSS expression must fail on ``build()`` or on ``extract()``.

    ``by`` selects the trigger: ``"build"`` calls ``build()``, anything else
    calls ``extract(element)``.  The extractor must stay unbuilt, and the
    ``ExprError`` must reference it, wrap a ``SelectorError`` and render a
    message matching the expected pattern.
    """
    css = CSSExtractor(expr)
    assert not css.built

    with pytest.raises(ExprError) as excinfo:
        if by != "build":
            css.extract(element)
        else:
            css.build()

    assert not css.built
    error = excinfo.value
    assert error.extractor is css
    assert isinstance(error.exc, SelectorError)
    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))

示例#10

0

显示文件

def test_item_extract_without_is_many(element1, Article0, build_first):
    """Extract a single article from ``element1`` and track the ``built`` flags.

    Nothing may be built right after construction; everything must be built
    after an explicit ``build()`` (when ``build_first`` is truthy) and after
    ``extract()`` has run.
    """
    item = Article0(CSSExtractor("li.article"))

    def parts():
        # Lazy on purpose: each attribute is looked up right before its check.
        yield item
        yield item.extractor
        yield item.title
        yield item.content

    for part in parts():
        assert not part.built

    if build_first:
        item.build()
        for part in parts():
            assert part.built

    assert item.extract(element1) == {"title": "Title 1", "content": "Content 1"}

    for part in parts():
        assert part.built

示例#11

0

显示文件

def test_item_extract_success_without_is_many_when_last_field_missing(
    element2, Article0, build_first
):
    """Extract a single article from ``element2`` and track the ``built`` flags.

    Without ``is_many`` the extraction is expected to succeed with the first
    article.  Nothing may be built right after construction; everything must
    be built after an explicit ``build()`` (when ``build_first`` is truthy)
    and after ``extract()`` has run.
    """
    item = Article0(CSSExtractor("li.article"))

    def parts():
        # Lazy on purpose: each attribute is looked up right before its check.
        yield item
        yield item.extractor
        yield item.title
        yield item.content

    for part in parts():
        assert not part.built

    if build_first:
        item.build()
        for part in parts():
            assert part.built

    assert item.extract(element2) == {"title": "Title 1", "content": "Content 1"}

    for part in parts():
        assert part.built

示例#12

0

显示文件

文件： test_utils.py 项目： linw1995/data_extractor

    assert str(ls) == ""

    string = "abc"
    assert str(ls) == "abc"


@pytest.fixture(params=[Field(), Item()], ids=repr)
def complex_extractor(request):
    """Provide each parametrized complex extractor (a ``Field`` and an ``Item``)."""
    current = request.param
    return current


@pytest.fixture(
    params=[
        AttrCSSExtractor(expr="div.class", attr="id") if not _missing_cssselect
        else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()),
        CSSExtractor(expr="div.class") if not _missing_cssselect else
        pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()),
        JSONPathExtractor(expr="boo") if not _missing_jsonpath else
        pytest.param("Missing 'jsonpath-extractor'", marks=pytest.mark.skip()),
        JSONPathRWExtractor(expr="boo") if not _missing_jsonpath_rw else
        pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip()),
        JSONPathRWExtExtractor(expr="boo") if not _missing_jsonpath_rw_ext else
        pytest.param("Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip()),
        TextCSSExtractor(expr="div.class") if not _missing_cssselect else
        pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()),
        XPathExtractor(expr="//div") if not _missing_lxml else pytest.param(
            "Missing 'lxml'", marks=pytest.mark.skip()),
    ],
    ids=repr,
)
def simple_extractor(request):

示例#13

0

显示文件

def test_complex_item_extract_xml_data(build_first):
    """Extract RSS items and the whole channel from the sample RSS asset.

    The XML is read from ``assets/sample-rss-2.xml`` next to this file and
    parsed with lxml.  A single ``ChannelItem``, the list of all items and
    the complete ``Channel`` are each extracted and compared with the
    expected data; when ``build_first`` is truthy each item is built
    explicitly before extracting.
    """
    from lxml.etree import fromstring as parse_xml

    rss_file = Path(__file__).parent.joinpath("assets", "sample-rss-2.xml")
    element = parse_xml(rss_file.read_text())

    class ChannelItem(Item):
        title = Field(XPathExtractor("./title/text()"), default="")
        link = Field(XPathExtractor("./link/text()"), default="")
        description = Field(XPathExtractor("./description/text()"))
        publish_date = Field(XPathExtractor("./pubDate/text()"))
        guid = Field(XPathExtractor("./guid/text()"))

    class Channel(Item):
        title = Field(XPathExtractor("./title/text()"))
        link = Field(XPathExtractor("./link/text()"))
        description = Field(XPathExtractor("./description/text()"))
        language = Field(XPathExtractor("./language/text()"))
        publish_date = Field(XPathExtractor("./pubDate/text()"))
        last_build_date = Field(XPathExtractor("./lastBuildDate/text()"))
        docs = Field(XPathExtractor("./docs/text()"))
        generator = Field(XPathExtractor("./generator/text()"))
        managing_editor = Field(XPathExtractor("./managingEditor/text()"))
        web_master = Field(XPathExtractor("./webMaster/text()"))

        items = ChannelItem(XPathExtractor("./item"), is_many=True)

    items_result = [
        {
            "title": "Star City",
            "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
            "description": (
                "How do Americans get ready to work "
                "with Russians aboard the International Space Station? "
                "They take a crash course in culture, "
                "language and protocol at Russia's "
                '<a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.'
            ),
            "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
        },
        {
            # This entry's title and link are expected to be the fields' "" default.
            "title": "",
            "link": "",
            "description": (
                "Sky watchers in Europe, Asia, and parts of Alaska and Canada "
                "will experience a "
                '<a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">'  # noqa: B950
                "partial eclipse of the Sun"
                "</a> on Saturday, May 31st."
            ),
            "publish_date": "Fri, 30 May 2003 11:06:42 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
        },
        {
            "title": "The Engine That Does More",
            "link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
            "description": (
                "Before man travels to Mars, "
                "NASA hopes to design new engines "
                "that will let us fly through the Solar System more quickly.  "
                "The proposed VASIMR engine would do that."
            ),
            "publish_date": "Tue, 27 May 2003 08:37:32 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571",
        },
        {
            "title": "Astronauts' Dirty Laundry",
            "link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
            "description": (
                "Compared to earlier spacecraft, "
                "the International Space Station has many luxuries, "
                "but laundry facilities are not one of them.  "
                "Instead, astronauts have other options."
            ),
            "publish_date": "Tue, 20 May 2003 08:56:02 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570",
        },
    ]

    def check(item, expected):
        # Shared tail of every scenario: optional explicit build, then extract.
        if build_first:
            item.build()
        assert item.extract(element) == expected

    # Single item: only the first <item> under <channel> is returned.
    check(ChannelItem(CSSExtractor("channel>item")), items_result[0])

    # Many items: every <item> under <channel> is returned.
    check(ChannelItem(CSSExtractor("channel>item"), is_many=True), items_result)

    # Whole channel, including the nested list of items.
    check(
        Channel(XPathExtractor("//channel")),
        {
            "title": "Liftoff News",
            "link": "http://liftoff.msfc.nasa.gov/",
            "description": "Liftoff to Space Exploration.",
            "language": "en-us",
            "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT",
            "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT",
            "docs": "http://blogs.law.harvard.edu/tech/rss",
            "generator": "Weblog Editor 2.0",
            "managing_editor": "*****@*****.**",
            "web_master": "*****@*****.**",
            "items": items_result,
        },
    )

示例#14

0

显示文件

文件： test_item.py 项目： NyntoFive/data_extractor

def test_item_extract_without_is_many(element1, Article0):
    """Without ``is_many`` only the first article of ``element1`` is extracted."""
    article = Article0(CSSExtractor("li.article"))
    extracted = article.extract(element1)
    assert extracted == {"title": "Title 1", "content": "Content 1"}

示例#15

0

显示文件

文件： test_utils.py 项目： x0rzkov/data_extractor

    string = "abc"
    assert str(ls) == "abc"


@pytest.fixture(params=[Field(), Item()], ids=repr)
def complex_extractor(request):
    """Provide each parametrized complex extractor (a ``Field`` and an ``Item``)."""
    current = request.param
    return current


@pytest.fixture(
    params=[
        AttrCSSExtractor(expr="div.class", attr="id")
        if not _missing_cssselect
        else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()),
        CSSExtractor(expr="div.class")
        if not _missing_cssselect
        else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()),
        JSONPathExtractor(expr="boo")
        if not _missing_jsonpath
        else pytest.param(
            "Missing 'jsonpath-extractor'", marks=pytest.mark.skip()
        ),
        JSONPathRWExtractor(expr="boo")
        if not _missing_jsonpath_rw
        else pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip()),
        JSONPathRWExtExtractor(expr="boo")
        if not _missing_jsonpath_rw_ext
        else pytest.param(
            "Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip()
        ),