def test_no_wellformed(testpath):
    """
    Test scraper without well-formed check.

    A small UTF-8 encoded XML document is written under ``testpath`` and
    scraped with False given as the second constructor argument. The
    scraper must then report that it was skipped and leave ``well_formed``
    as None.

    :testpath: Directory in which the test file is created
    """
    xml = (
        """<?xml version="1.0" encoding="UTF-8" ?> <a>åäö</a>"""
    ).encode("utf-8")

    # The fixture is written straight into the caller-supplied directory,
    # so nothing is left open or behind outside of it.
    tmppath = os.path.join(testpath, "valid__.csv")
    with open(tmppath, "wb") as file_:
        file_.write(xml)

    scraper = LxmlScraper(tmppath, False)
    scraper.scrape_file()

    assert partial_message_included("Skipping scraper", scraper.messages())
    assert scraper.well_formed is None
def test_forced_filetype(filename, result_dict, filetype, evaluate_scraper):
    """
    Test using user-supplied MIME-types and versions.

    :filename: Test file name
    :result_dict: Expected results, handed to force_correct_filetype
    :filetype: Dict providing the "given_mimetype" and "given_version"
               values that are forced on the scraper
    :evaluate_scraper: Callable invoked with the scraper and the expected
                       result once the file has been scraped
    """
    expected = force_correct_filetype(
        filename, result_dict, filetype, ["(:unav)"]
    )

    # Forward the user-given file type to the scraper as parameters.
    forced = {}
    forced["mimetype"] = filetype["given_mimetype"]
    forced["version"] = filetype["given_version"]

    lxml_scraper = LxmlScraper(expected.filename, True, forced)
    lxml_scraper.scrape_file()

    evaluate_scraper(lxml_scraper, expected)
def test_xml_encoding(testpath, file_encoding):
    """
    Test that encoding check from XML header works.

    :testpath: Directory in which the test file is written
    :file_encoding: Codec name used to encode the file; must be one of
                    "latin_1", "utf_8" or "utf_16"
    """
    # Codec name -> encoding name declared in the XML header.
    declared_by_codec = {
        "latin_1": u"ISO-8859-15",
        "utf_8": "UTF-8",
        "utf_16": "UTF-16",
    }
    declared = declared_by_codec[file_encoding]

    template = """<?xml version="1.0" encoding="{}" ?> <a>åäö</a>"""
    document = template.format(declared)

    target = os.path.join(testpath, "valid__.csv")
    with open(target, "wb") as handle:
        handle.write(document.encode(file_encoding))

    lxml_scraper = LxmlScraper(target, "text/xml")
    lxml_scraper.scrape_file()

    # NOTE: the charset comparison below is currently disabled.
    # assert lxml_scraper.streams[0]["charset"] == declared
    assert lxml_scraper.well_formed
def test_xml_encoding(testpath, file_encoding):
    """
    Test that encoding check from XML header works.

    The encoding name declared in the XML header is also passed to the
    scraper as the "charset" parameter.

    :testpath: Directory in which the test file is written
    :file_encoding: File character encoding; must be one of "latin_1",
                    "utf_8" or "utf_16"
    """
    # Codec name -> encoding name declared in the XML header.
    header_names = {
        "latin_1": u"ISO-8859-15",
        "utf_8": "UTF-8",
        "utf_16": "UTF-16",
    }
    charset = header_names[file_encoding]

    template = """<?xml version="1.0" encoding="{}" ?> <a>åäö</a>"""
    document = template.format(charset)

    target = os.path.join(testpath, "valid__.csv")
    with io.open(target, "wb") as handle:
        handle.write(document.encode(file_encoding))

    lxml_scraper = LxmlScraper(
        filename=target,
        mimetype="text/xml",
        params={"charset": charset},
    )
    lxml_scraper.scrape_file()

    assert lxml_scraper.well_formed
def test_is_supported_deny():
    """
    Test is_supported method for html 5.0 files.

    Checks which combinations of MIME type, version, third positional
    argument and parameters are accepted by LxmlScraper.is_supported and
    which are rejected.
    """
    mimetype = "text/html"
    version = "5.0"
    is_supported = LxmlScraper.is_supported

    # Accepted: known MIME type with the known version or with no version.
    assert is_supported(mimetype, version, True)
    assert is_supported(mimetype, None, True)

    # Rejected: schematron parameter given, third argument False,
    # unknown version, unknown MIME type.
    assert not is_supported(mimetype, version, True, {"schematron": "test"})
    assert not is_supported(mimetype, version, False)
    assert not is_supported(mimetype, "foo", True)
    assert not is_supported("foo", version, True)
def test_charset(filename, mimetype, charset, well_formed):
    """
    Test charset parameter.

    :filename: Test file name
    :mimetype: File MIME type
    :charset: File character encoding
    :well_formed: Expected result of well-formedness
    """
    lxml_scraper = LxmlScraper(
        filename=filename,
        mimetype=mimetype,
        params={"charset": charset},
    )
    lxml_scraper.scrape_file()

    assert lxml_scraper.well_formed == well_formed

    # Without a charset the scraper must complain about the missing encoding.
    if not charset:
        assert partial_message_included(
            "encoding not defined", lxml_scraper.errors())
        return

    # With a charset: no errors for a well-formed file, otherwise an error
    # mentioning the encoding declaration found in the file.
    if well_formed:
        assert not lxml_scraper.errors()
    else:
        assert partial_message_included(
            "Found encoding declaration UTF-8", lxml_scraper.errors())