def test_invalid_css_selector_expr(element, expr):
    """Extracting with the ``expr`` fixture's selector must raise ``ExprError``.

    The raised error has to point back at the extractor that was used and
    wrap a ``SelectorSyntaxError`` as its underlying cause.
    """
    css_extractor = CSSExtractor(expr)

    with pytest.raises(ExprError) as raised:
        css_extractor.extract(element)

    error = raised.value
    assert error.extractor is css_extractor
    assert isinstance(error.exc, SelectorSyntaxError)
def test_item_extract_failure_when_last_field_missing(
    element2, Article0, build_first
):
    """Many-mode extraction over ``element2`` must raise ``ExtractError``.

    The item and its parts report ``built`` as false right after
    construction and as true once extraction has been attempted; when
    ``build_first`` is truthy the item is built explicitly beforehand and
    checked at that point as well. The error must list the ``content`` field
    followed by the item itself, and reference the second ``li.article``
    element.
    """
    item = Article0(CSSExtractor("li.article"), is_many=True)

    def built_flags():
        # Re-read the flags on every call so each check sees the current state.
        return (
            item.built,
            item.extractor.built,
            item.title.built,
            item.content.built,
        )

    assert not any(built_flags())

    if build_first:
        item.build()
        assert all(built_flags())

    with pytest.raises(ExtractError) as raised:
        item.extract(element2)

    assert all(built_flags())

    error = raised.value
    failing_chain = error.extractors
    assert len(failing_chain) == 2
    assert failing_chain[0] is Article0.content
    assert failing_chain[1] is item

    second_article = element2.xpath("//li[@class='article'][2]")[0]
    assert error.element is second_article
def test_item_extract(element1, Article0, build_first):
    """Many-mode extraction over ``element1`` yields one dict per article.

    The item and its parts report ``built`` as false right after
    construction and as true after extraction; when ``build_first`` is
    truthy the item is built explicitly beforehand and checked at that point
    as well.
    """
    item = Article0(CSSExtractor("li.article"), is_many=True)

    def built_flags():
        # Re-read the flags on every call so each check sees the current state.
        return (
            item.built,
            item.extractor.built,
            item.title.built,
            item.content.built,
        )

    assert not any(built_flags())

    if build_first:
        item.build()
        assert all(built_flags())

    expected = [
        {"title": "Title 1", "content": "Content 1"},
        {"title": "Title 2", "content": "Content 2"},
    ]
    assert item.extract(element1) == expected

    assert all(built_flags())
def test_item_extract_success_without_is_many_when_last_field_missing(
    element2, Article0
):
    """Without ``is_many``, extraction over ``element2`` returns one dict."""
    item = Article0(CSSExtractor("li.article"))
    expected = {"title": "Title 1", "content": "Content 1"}
    assert item.extract(element2) == expected
def test_invalid_css_selector_expr(element, expr):
    """Constructing ``CSSExtractor`` from the ``expr`` fixture raises ``ExprError``.

    The error must wrap a cssselect ``SelectorError`` and render with the
    expected message layout.
    """
    # Third Party Library
    from cssselect.parser import SelectorError

    with pytest.raises(ExprError) as raised:
        CSSExtractor(expr)

    error = raised.value
    assert isinstance(error.exc, SelectorError)

    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))
def test_item_extract_failure_when_last_field_missing(element2, Article0):
    """Many-mode extraction over ``element2`` must raise ``ExtractError``.

    The error must list the ``content`` field followed by the item itself,
    and reference the second ``li.article`` element.
    """
    article_item = Article0(CSSExtractor("li.article"), is_many=True)

    with pytest.raises(ExtractError) as raised:
        article_item.extract(element2)

    error = raised.value
    failing_chain = error.extractors
    assert len(failing_chain) == 2
    assert failing_chain[0] is Article0.content
    assert failing_chain[1] is article_item

    second_article = element2.xpath("//li[@class='article'][2]")[0]
    assert error.element is second_article
def test_item_extract(element1, Article0):
    """Many-mode extraction over ``element1`` yields one dict per article."""
    item = Article0(CSSExtractor("li.article"), is_many=True)
    expected = [
        {"title": "Title 1", "content": "Content 1"},
        {"title": "Title 2", "content": "Content 2"},
    ]
    assert item.extract(element1) == expected
def test_missing_cssselect():
    """Each CSS-based extractor constructor raises ``RuntimeError``.

    The error message of every one of them has to mention ``cssselect``.
    """
    # Deferred constructors, so each one runs inside its own ``raises`` block.
    constructors = (
        lambda: CSSExtractor("a>b"),
        lambda: AttrCSSExtractor("a>b", "href"),
        lambda: TextCSSExtractor("a>b"),
    )

    for construct in constructors:
        with pytest.raises(RuntimeError) as raised:
            construct()
        assert "cssselect" in str(raised.value)
def test_invalid_css_selector_expr(element, expr, by):
    """The ``expr`` fixture's selector must raise ``ExprError`` on build or extract.

    ``by`` picks the trigger: ``"build"`` calls ``build()``, anything else
    calls ``extract(element)``. The extractor must report ``built`` as false
    both before and after the failure, and the error must reference the
    extractor, wrap a ``SelectorError`` and render with the expected message
    layout.
    """
    css_extractor = CSSExtractor(expr)
    assert not css_extractor.built

    with pytest.raises(ExprError) as raised:
        if by != "build":
            css_extractor.extract(element)
        else:
            css_extractor.build()

    assert not css_extractor.built

    error = raised.value
    assert error.extractor is css_extractor
    assert isinstance(error.exc, SelectorError)

    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))
def test_item_extract_without_is_many(element1, Article0, build_first):
    """Without ``is_many``, extraction over ``element1`` returns one dict.

    The item and its parts report ``built`` as false right after
    construction and as true after extraction; when ``build_first`` is
    truthy the item is built explicitly beforehand and checked at that point
    as well.
    """
    item = Article0(CSSExtractor("li.article"))

    def built_flags():
        # Re-read the flags on every call so each check sees the current state.
        return (
            item.built,
            item.extractor.built,
            item.title.built,
            item.content.built,
        )

    assert not any(built_flags())

    if build_first:
        item.build()
        assert all(built_flags())

    expected = {"title": "Title 1", "content": "Content 1"}
    assert item.extract(element1) == expected

    assert all(built_flags())
def test_item_extract_success_without_is_many_when_last_field_missing(
    element2, Article0, build_first
):
    """Without ``is_many``, extraction over ``element2`` returns one dict.

    The item and its parts report ``built`` as false right after
    construction and as true after extraction; when ``build_first`` is
    truthy the item is built explicitly beforehand and checked at that point
    as well.
    """
    item = Article0(CSSExtractor("li.article"))

    def built_flags():
        # Re-read the flags on every call so each check sees the current state.
        return (
            item.built,
            item.extractor.built,
            item.title.built,
            item.content.built,
        )

    assert not any(built_flags())

    if build_first:
        item.build()
        assert all(built_flags())

    expected = {"title": "Title 1", "content": "Content 1"}
    assert item.extract(element2) == expected

    assert all(built_flags())
assert str(ls) == "" string = "abc" assert str(ls) == "abc" @pytest.fixture(params=[Field(), Item()], ids=repr) def complex_extractor(request): return request.param @pytest.fixture( params=[ AttrCSSExtractor(expr="div.class", attr="id") if not _missing_cssselect else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()), CSSExtractor(expr="div.class") if not _missing_cssselect else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()), JSONPathExtractor(expr="boo") if not _missing_jsonpath else pytest.param("Missing 'jsonpath-extractor'", marks=pytest.mark.skip()), JSONPathRWExtractor(expr="boo") if not _missing_jsonpath_rw else pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip()), JSONPathRWExtExtractor(expr="boo") if not _missing_jsonpath_rw_ext else pytest.param("Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip()), TextCSSExtractor(expr="div.class") if not _missing_cssselect else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()), XPathExtractor(expr="//div") if not _missing_lxml else pytest.param( "Missing 'lxml'", marks=pytest.mark.skip()), ], ids=repr, ) def simple_extractor(request):
# Test: nested Item extraction against the sample RSS document.
#
# Reads "assets/sample-rss-2.xml" located next to this test file, parses it
# with lxml.etree.fromstring, and declares two Item subclasses locally:
#   * ChannelItem - title, link, description, publish_date and guid fields;
#     title and link fall back to "" via default="".
#   * Channel - the channel-level fields plus a nested many-mode ChannelItem
#     bound to "./item".
# Three extractions are then compared with the expected data:
#   1. ChannelItem via CSS "channel>item" without is_many -> items_result[0]
#   2. ChannelItem via CSS "channel>item" with is_many=True -> items_result
#   3. Channel via XPath "//channel" -> the channel dict, whose "items" entry
#      is items_result
# When build_first is truthy, item.build() is called before each extract().
#
# NOTE(review): the "managing_editor" / "web_master" expected values look
# redacted ("*****@*****.**") - confirm against the asset file.
# NOTE(review): one description literal straddles a physical line break
# ("...not one of them. "); whitespace inside it may have been altered by
# reformatting - verify against the asset before reflowing this function.
def test_complex_item_extract_xml_data(build_first): from lxml.etree import fromstring sample_rss_path = Path(__file__).parent / "assets" / "sample-rss-2.xml" text = sample_rss_path.read_text() element = fromstring(text) class ChannelItem(Item): title = Field(XPathExtractor("./title/text()"), default="") link = Field(XPathExtractor("./link/text()"), default="") description = Field(XPathExtractor("./description/text()")) publish_date = Field(XPathExtractor("./pubDate/text()")) guid = Field(XPathExtractor("./guid/text()")) class Channel(Item): title = Field(XPathExtractor("./title/text()")) link = Field(XPathExtractor("./link/text()")) description = Field(XPathExtractor("./description/text()")) language = Field(XPathExtractor("./language/text()")) publish_date = Field(XPathExtractor("./pubDate/text()")) last_build_date = Field(XPathExtractor("./lastBuildDate/text()")) docs = Field(XPathExtractor("./docs/text()")) generator = Field(XPathExtractor("./generator/text()")) managing_editor = Field(XPathExtractor("./managingEditor/text()")) web_master = Field(XPathExtractor("./webMaster/text()")) items = ChannelItem(XPathExtractor("./item"), is_many=True) items_result = [ { "title": "Star City", "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp", "description": ("How do Americans get ready to work " "with Russians aboard the International Space Station? " "They take a crash course in culture, " "language and protocol at Russia's " '<a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.' 
), "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573", }, { "title": "", "link": "", "description": ( "Sky watchers in Europe, Asia, and parts of Alaska and Canada " "will experience a " '<a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">' # noqa: B950 "partial eclipse of the Sun" "</a> on Saturday, May 31st."), "publish_date": "Fri, 30 May 2003 11:06:42 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572", }, { "title": "The Engine That Does More", "link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp", "description": ("Before man travels to Mars, " "NASA hopes to design new engines " "that will let us fly through the Solar System more quickly. " "The proposed VASIMR engine would do that."), "publish_date": "Tue, 27 May 2003 08:37:32 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571", }, { "title": "Astronauts' Dirty Laundry", "link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp", "description": ("Compared to earlier spacecraft, " "the International Space Station has many luxuries, " "but laundry facilities are not one of them. 
" "Instead, astronauts have other options."), "publish_date": "Tue, 20 May 2003 08:56:02 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570", }, ] item = ChannelItem(CSSExtractor("channel>item")) if build_first: item.build() assert item.extract(element) == items_result[0] item = ChannelItem(CSSExtractor("channel>item"), is_many=True) if build_first: item.build() assert item.extract(element) == items_result item = Channel(XPathExtractor("//channel")) if build_first: item.build() assert item.extract(element) == { "title": "Liftoff News", "link": "http://liftoff.msfc.nasa.gov/", "description": "Liftoff to Space Exploration.", "language": "en-us", "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT", "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT", "docs": "http://blogs.law.harvard.edu/tech/rss", "generator": "Weblog Editor 2.0", "managing_editor": "*****@*****.**", "web_master": "*****@*****.**", "items": items_result, }
def test_item_extract_without_is_many(element1, Article0):
    """Without ``is_many``, extraction over ``element1`` returns one dict."""
    item = Article0(CSSExtractor("li.article"))
    expected = {"title": "Title 1", "content": "Content 1"}
    assert item.extract(element1) == expected
string = "abc" assert str(ls) == "abc" @pytest.fixture(params=[Field(), Item()], ids=repr) def complex_extractor(request): return request.param @pytest.fixture( params=[ AttrCSSExtractor(expr="div.class", attr="id") if not _missing_cssselect else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()), CSSExtractor(expr="div.class") if not _missing_cssselect else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()), JSONPathExtractor(expr="boo") if not _missing_jsonpath else pytest.param( "Missing 'jsonpath-extractor'", marks=pytest.mark.skip() ), JSONPathRWExtractor(expr="boo") if not _missing_jsonpath_rw else pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip()), JSONPathRWExtExtractor(expr="boo") if not _missing_jsonpath_rw_ext else pytest.param( "Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip() ),