def test_scraper_tif(filename, result_dict, evaluate_scraper):
    """
    Test scraper with tiff files.

    :filename: Test file name
    :result_dict: Result dict containing the test purpose, parts of
                  expected results of stdout and stderr, and expected
                  streams
    """
    correct = parse_results(filename, "image/tiff", result_dict, True)
    # Expected stdout/stderr depend on whether the file is well-formed.
    well_formed = correct.well_formed
    correct.stdout_part = VALID_MSG if well_formed else ""
    correct.stderr_part = "" if well_formed else INVALID_MSG

    scraper = PilScraper(filename=correct.filename, mimetype="image/tiff")
    scraper.scrape_file()

    if not well_formed:
        assert not scraper.well_formed
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
        assert not scraper.streams
    else:
        # PIL does not report a version for tiff streams.
        for stream in correct.streams.values():
            stream["version"] = UNAV
        evaluate_scraper(scraper, correct)
def test_scraper_gif(filename, result_dict, evaluate_scraper):
    """
    Test scraper with gif files.

    :filename: Test file name
    :result_dict: Result dict containing the test purpose, parts of
                  expected results of stdout and stderr, and expected
                  streams
    """
    correct = parse_results(filename, "image/gif", result_dict, True)
    if correct.well_formed:
        # GIF is an index image
        correct.streams[0]["samples_per_pixel"] = "1"
        for stream in correct.streams.values():
            stream["version"] = UNAV
        correct.stdout_part = VALID_MSG
        correct.stderr_part = ""
    else:
        correct.stdout_part = ""
        correct.stderr_part = INVALID_MSG

    scraper = PilScraper(filename=correct.filename, mimetype="image/gif")
    scraper.scrape_file()

    if not correct.well_formed:
        assert not scraper.well_formed
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
        assert not scraper.streams
    else:
        evaluate_scraper(scraper, correct)
def test_scraper(filename, result_dict, header, extra_params,
                 evaluate_scraper):
    """
    Write test data and run csv scraping for the file.

    :filename: Test file name
    :result_dict: Result dict containing purpose of the test, parts of
                  expected stdout and stderr, and expected streams
    :header: CSV header line
    :extra_params: Extra parameters for the scraper (e.g. charset)
    """
    correct = parse_results(filename, "text/csv", result_dict, True)
    stream = correct.streams[0]
    # Base parameters come from the expected stream; extra_params may
    # override or extend them.
    params = {
        "separator": stream["separator"],
        "delimiter": stream["delimiter"],
        "fields": header,
        "mimetype": MIMETYPE
    }
    params.update(extra_params)

    scraper = CsvScraper(filename=correct.filename, mimetype=MIMETYPE,
                         params=params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
def test_scraper(filename, result_dict, params, evaluate_scraper):
    """
    Test scraper.

    :filename: Test file name
    :result_dict: Result dict containing test purpose, and parts of
                  expected results of stdout and stderr
    :params: schematron file as extra parameter
    """
    correct = parse_results(filename, "text/xml", result_dict, True, params)
    scraper = SchematronScraper(filename=correct.filename,
                                mimetype="text/xml", params=correct.params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)

    # Verbose mode keeps all messages; otherwise the suppression note
    # should be present whenever any message was produced.
    if correct.params.get("verbose"):
        assert not partial_message_included("have been suppressed",
                                            scraper.messages())
    elif scraper.messages():
        assert partial_message_included("have been suppressed",
                                        scraper.messages())
def test_encoding_check(filename, charset, is_wellformed, evaluate_scraper):
    """
    Test character encoding validation with brute force.

    :filename: Test file name
    :charset: Character encoding
    :is_wellformed: Expected result of well-formedness
    """
    params = {"charset": charset}
    correct = parse_results(filename, "text/plain", {}, True, params)
    scraper = TextEncodingScraper(filename=correct.filename,
                                  mimetype="text/plain", params=params)
    scraper.scrape_file()

    # Expected metadata depends on whether decoding succeeds.
    if is_wellformed:
        correct.update_mimetype("text/plain")
        correct.update_version(UNAP)
        correct.streams[0]["stream_type"] = "text"
    else:
        correct.update_mimetype(UNAV)
        correct.update_version(UNAV)
        correct.streams[0]["stream_type"] = UNAV
    correct.well_formed = is_wellformed

    if correct.well_formed:
        correct.stdout_part = "encoding validated successfully"
        correct.stderr_part = ""
        evaluate_scraper(scraper, correct)
    else:
        assert partial_message_included("decoding error", scraper.errors())
        assert scraper.errors()
        assert not scraper.well_formed
def test_scraper(filename, result_dict, evaluate_scraper):
    """Test scraper."""
    expected = parse_results(filename, MIMETYPE, result_dict, True)
    scraper = DpxScraper(expected.filename, True, expected.params)
    scraper.scrape_file()
    evaluate_scraper(scraper, expected)
def test_mixed_filetype(filename, result_dict, filetype, evaluate_scraper):
    """
    Test scraping files as wrong but supported file type.

    Some metadata models support many file types. For example,
    OfficeMagicMeta supports text, spreadsheet and presentation in both
    MS and open formats, among other file types. A side effect of this
    is that it is entirely possible to scrape e.g. an ods file as a doc
    (or xls) file by just forcing the file type the scraper uses, and
    this does not produce errors and the file is reported as
    well-formed. Currently this does not cause problems if the user is
    aware of this functionality, as no metadata scraping results are
    affected by it. This test can hopefully catch if problematic
    metadata functions are introduced in the future.
    """
    correct = parse_results(filename, filetype["correct_mimetype"],
                            result_dict, True)
    correct.update_mimetype(filetype["expected_mimetype"])
    correct.update_version(filetype["expected_version"])

    # Fall back to the correct MIME type when none was explicitly given.
    mimetype_guess = (filetype["given_mimetype"] or
                      filetype["correct_mimetype"])
    params = {
        "mimetype": filetype["given_mimetype"],
        "version": filetype["given_version"],
        "mimetype_guess": mimetype_guess
    }
    scraper = MagicScraper(correct.filename, True, params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
def test_scraper_invalid(filename, result_dict, params, evaluate_scraper):
    """
    Test scraper with invalid files.

    :filename: Test file name
    :result_dict: Result dict containing test purpose, and parts of
                  expected results of stdout and stderr
    :params: Extra parameters for Scraper
    """
    correct = parse_results(filename, "text/xml", result_dict, True, params)
    scraper = XmllintScraper(filename=correct.filename, mimetype="text/xml",
                             params=correct.params)
    scraper.scrape_file()

    # These files are expected to fail validation outright.
    broken_markers = ("empty", "no_closing_tag", "no_namespace_catalog",
                      "diacritics")
    if any(marker in filename for marker in broken_markers):
        correct.well_formed = False
        correct.version = None
        correct.streams[0]["version"] = None

    if not correct.well_formed:
        assert not scraper.well_formed
        assert not scraper.streams
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
    else:
        evaluate_scraper(scraper, correct)
        assert not partial_message_included("<note>", scraper.messages())
def test_invalid_text(filename, mimetype):
    """Test TextFileMagic with invalid files."""
    result_dict = {
        "purpose": "Test invalid file.",
        "stdout_part": "",
        "stderr_part": "Unsupported MIME type"
    }
    correct = parse_results(filename, mimetype, result_dict, True)
    params = correct.params
    params["mimetype_guess"] = correct.mimetype
    scraper = MagicScraper(correct.filename, True, params)
    scraper.scrape_file()

    # Magic reports empty files and unrecognized data differently.
    detected = ("inode/x-empty" if "empty" in filename
                else "application/octet-stream")
    correct.streams[0]["mimetype"] = detected
    correct.mimetype = detected
    correct.version = None
    correct.streams[0]["version"] = None
    correct.streams[0]["charset"] = None

    assert not scraper.well_formed
    assert not scraper.streams
    assert partial_message_included(correct.stdout_part, scraper.messages())
    assert partial_message_included(correct.stderr_part, scraper.errors())
def test_scraper_valid(filename, mimetype, evaluate_scraper):
    """Test scraper."""
    result_dict = {
        "purpose": "Test valid file.",
        "stdout_part": "successfully",
        "stderr_part": ""
    }
    correct = parse_results(filename, mimetype, result_dict, True)
    params = correct.params
    params["mimetype_guess"] = correct.mimetype
    scraper = MagicScraper(correct.filename, True, params)
    scraper.scrape_file()

    # Adjust expectations for formats where magic cannot resolve
    # everything (version, charset, or stream type).
    stream = correct.streams[0]
    if correct.mimetype == "application/xhtml+xml":
        stream["stream_type"] = "text"
    if (OfficeFileMagicMeta.is_supported(correct.mimetype) or
            HtmlFileMagicMeta.is_supported(correct.mimetype)):
        correct.version = None
        stream["version"] = None
    if correct.mimetype in ["text/plain", "text/csv"]:
        stream["charset"] = "UTF-8"
        stream["version"] = "(:unap)"
    if filename == "valid__iso8859.txt":
        stream["charset"] = "ISO-8859-15"
    if mimetype == "text/html" or "vnd." in mimetype or "msword" in mimetype:
        stream["version"] = "(:unav)"
    if mimetype == "image/jp2":
        stream["version"] = ""

    evaluate_scraper(scraper, correct)
def test_scraper_pdf(filename, result_dict, evaluate_scraper):
    """
    Test Ghostscript scraper.

    :filename: Test filename. Character X is replaced with versions 1.7,
               A-1a, A-2b, and A-3b. All of these files must be found.
    :result_dict: Result dict containing the test purpose, and parts of
                  expected results of stdout and stderr
    """
    for ver in ["1.7", "A-1a", "A-2b", "A-3b"]:
        # Bind the substituted name to a fresh variable. Rebinding
        # ``filename`` itself would strip the "X" placeholder on the
        # first iteration, turning later replace() calls into no-ops so
        # only the 1.7 file would ever be tested.
        versioned_filename = filename.replace("X", ver)
        correct = parse_results(versioned_filename, "application/pdf",
                                result_dict, True)
        scraper = GhostscriptScraper(filename=correct.filename,
                                     mimetype="application/pdf")
        scraper.scrape_file()

        # Ghostscript cannot handle version or MIME type
        correct.streams[0]["version"] = UNAV
        correct.streams[0]["mimetype"] = UNAV

        evaluate_scraper(scraper, correct, eval_output=False)

        if scraper.well_formed:
            assert not partial_message_included("Error", scraper.messages())
            assert not scraper.errors()
        else:
            assert partial_message_included(correct.stderr_part,
                                            scraper.errors())
            assert partial_message_included(correct.stdout_part,
                                            scraper.messages())
def test_scraper_gif(filename, result_dict, evaluate_scraper):
    """Test scraper with gif files."""
    correct = parse_results(filename, "image/gif", result_dict, True)
    if correct.well_formed:
        # GIF is an index image
        correct.streams[0]["samples_per_pixel"] = "1"
        for stream in correct.streams.values():
            stream["version"] = "(:unav)"
        correct.stdout_part = VALID_MSG
        correct.stderr_part = ""
    else:
        correct.stdout_part = ""
        correct.stderr_part = INVALID_MSG

    scraper = PilScraper(correct.filename, True, correct.params)
    scraper.scrape_file()

    if not correct.well_formed:
        assert not scraper.well_formed
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
        assert not scraper.streams
    else:
        evaluate_scraper(scraper, correct)
def test_invalid_markdown_pdf_arc(filename, mimetype, class_,
                                  evaluate_scraper):
    """Test scrapers for invalid XML, XHTML, HTML, pdf and arc files."""
    result_dict = {
        'purpose': 'Test invalid file.',
        'stdout_part': 'successfully',
        'stderr_part': ''}
    correct = parse_results(filename, mimetype, result_dict, True)
    scraper = class_(correct.filename, correct.mimetype, True,
                     correct.params)
    scraper.scrape_file()

    # These scrapers only collect metadata, so the files are reported
    # as well-formed despite being invalid.
    correct.well_formed = True
    if 'empty' in filename:
        correct.streams[0]['mimetype'] = 'inode/x-empty'
    if class_ is HtmlFileMagic:
        correct.version = None
        correct.streams[0]['version'] = None
    if class_ in (XhtmlFileMagic,):
        correct.streams[0]['stream_type'] = 'text'
    if class_ in (HtmlFileMagic, XmlFileMagic, XhtmlFileMagic):
        correct.streams[0]['charset'] = 'UTF-8'

    evaluate_scraper(scraper, correct)
def test_existing_files(filename, mimetype, is_textfile, evaluate_scraper):
    """
    Test detecting whether file is a textfile.

    The scraper tool is not able to detect UTF-16 files without BOM or
    UTF-32 files.

    :filename: Test file name
    :mimetype: File MIME type
    :is_textfile: Expected result whether a file is a text file or not
    """
    correct = parse_results(filename, mimetype, {}, True)
    scraper = TextfileScraper(filename=correct.filename,
                              mimetype="text/plain")
    scraper.scrape_file()

    # Expected metadata depends on whether the file is detected as text.
    correct.streams[0]["stream_type"] = "text" if is_textfile else UNAV
    correct.update_mimetype("text/plain" if is_textfile else UNAV)
    correct.streams[0]["version"] = UNAP if is_textfile else UNAV
    correct.well_formed = is_textfile

    if correct.well_formed:
        correct.stdout_part = VALID_MSG
        correct.stderr_part = ""
        evaluate_scraper(scraper, correct)
    else:
        assert partial_message_included(INVALID_MSG, scraper.errors())
        assert scraper.errors()
        assert not scraper.well_formed
def test_no_parameters(testpath, evaluate_scraper):
    """Test scraper without separate parameters."""
    path = os.path.join(testpath, 'valid__.csv')
    with open(path, 'wb') as outfile:
        outfile.write(VALID_CSV)

    scraper = CsvScraper(path)
    scraper.scrape_file()

    expected_streams = {
        0: {'stream_type': 'text',
            'index': 0,
            'mimetype': MIMETYPE,
            'version': '',
            'delimiter': ',',
            'separator': '\r\n',
            'first_line': ['1997', 'Ford', 'E350', 'ac, abs, moon',
                           '3000.00']}
    }
    correct = parse_results(
        'valid__.csv', MIMETYPE,
        {'purpose': 'Test valid file on default settings.',
         'stdout_part': 'successfully',
         'stderr_part': '',
         'streams': expected_streams},
        True)
    correct.streams[0]['version'] = "(:unap)"
    evaluate_scraper(scraper, correct)
def test_jpeg2000_inside_pdf(evaluate_scraper):
    """
    Test scraping a pdf file containing JPEG2000 image.

    Default Ghostscript installation on CentOS 7 does not support pdf
    files containing JPXDecode data. This test verifies that the
    installed version is recent enough.
    """
    correct = parse_results(
        "valid_1.7_jpeg2000.pdf", "application/pdf",
        {"purpose": "Test pdf with JPEG2000 inside it.",
         "stdout_part": "Well-formed and valid",
         "stderr_part": ""},
        True)
    scraper = GhostscriptScraper(correct.filename, True)
    scraper.scrape_file()

    # Ghostscript cannot handle version or MIME type
    for key in ("version", "mimetype"):
        setattr(correct, key, "(:unav)")
        correct.streams[0][key] = "(:unav)"

    evaluate_scraper(scraper, correct, eval_output=False)
def test_gzip_scraper(filename, result_dict, evaluate_scraper):
    """
    Test scraper for gzip files.

    :filename: Test file name
    :result_dict: Result dict containing test purpose, and parts of
                  expected results of stdout and stderr
    """
    correct = parse_results(filename, "application/warc", result_dict, True)
    scraper = GzipWarctoolsScraper(filename=correct.filename,
                                   mimetype="application/gzip")
    scraper.scrape_file()

    # When the payload cannot be identified, the scraper falls back to
    # plain gzip handling.
    classname = "WarctoolsFullScraper"
    if not correct.well_formed and correct.streams[0]["version"] == UNAV:
        correct.update_mimetype("application/gzip")
        classname = "GzipWarctoolsScraper"

    if correct.well_formed:
        evaluate_scraper(scraper, correct, exp_scraper_cls=classname)
    else:
        assert not scraper.well_formed
        assert not scraper.streams
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
def test_scraper(testpath, csv_text, result_dict, prefix, header,
                 evaluate_scraper, extra_params):
    """
    Write test data and run csv scraping for the file.

    NB: Forcing unsupported MIME type causes an error to be logged,
    resulting in the file being reported as not well-formed regardless
    of its contents.
    """
    with open(os.path.join(testpath, '%s.csv' % prefix), 'wb') as outfile:
        outfile.write(csv_text)

    mimetype = result_dict['streams'][0]['mimetype']
    version = result_dict['streams'][0]['version']
    basepath, name = outfile.name.rsplit('/', 1)
    correct = parse_results(name, '', result_dict, True, basepath=basepath)
    correct.update_mimetype(mimetype)
    correct.update_version(version)
    if mimetype != 'text/csv':
        correct.well_formed = False

    # extra_params may override the defaults taken from the stream.
    params = {
        'separator': correct.streams[0]['separator'],
        'delimiter': correct.streams[0]['delimiter'],
        'fields': header
    }
    params.update(extra_params)

    scraper = CsvScraper(correct.filename, True, params=params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
def test_mediainfo_scraper_wav(filename, result_dict, evaluate_scraper):
    """
    Test WAV scraping with Mediainfo.

    :filename: Test file name
    :result_dict: Result dict containing the test purpose, parts of
                  expected results of stdout and stderr, and expected
                  streams
    """
    mimetype = "audio/x-wav"
    correct = parse_results(filename, mimetype, result_dict, False)
    # BWF files (marked with "2" in the name) carry a version.
    correct.streams[0]["version"] = "2" if "2" in filename else UNAP

    scraper = MediainfoScraper(filename=correct.filename, mimetype=mimetype)
    scraper.scrape_file()

    if "empty" not in filename:
        evaluate_scraper(scraper, correct)
    else:
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
        assert not scraper.streams
def test_mediainfo_scraper_mov(filename, result_dict, mimetype,
                               evaluate_scraper):
    """
    Test Quicktime and DV scraping with Mediainfo.

    :filename: Test file name
    :result_dict: Result dict containing the test purpose, parts of
                  expected results of stdout and stderr, and expected
                  streams
    :mimetype: File MIME type
    """
    correct = parse_results(filename, mimetype, result_dict, False)
    scraper = MediainfoScraper(filename=correct.filename, mimetype=mimetype)
    scraper.scrape_file()

    if filename == "valid__h264_aac_no_ftyp_atom.mov":
        correct.streams[0]["codec_name"] = "QuickTime"
    if ".dv" in filename:
        # DV streams do not carry a container-level stream type.
        correct.streams[0].pop("stream_type", None)

    if "empty" not in filename:
        evaluate_scraper(scraper, correct)
    else:
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
        assert not scraper.streams
def test_gzip_scraper(filename, result_dict, evaluate_scraper):
    """Test scraper."""
    # Select the expected payload type from the file name.
    if "warc" in filename:
        mime, classname = "application/warc", "WarcWarctoolsScraper"
    else:
        mime, classname = ("application/x-internet-archive",
                           "ArcWarctoolsScraper")

    correct = parse_results(filename, mime, result_dict, True)
    scraper = GzipWarctoolsScraper(correct.filename, True, correct.params)
    scraper.scrape_file()

    if (correct.version == "" or
            correct.mimetype == "application/x-internet-archive"):
        correct.version = None
        correct.streams[0]["version"] = "(:unav)"
    if not correct.well_formed and correct.version is None:
        correct.mimetype = "application/gzip"
        correct.streams[0]["mimetype"] = "application/gzip"
        classname = "GzipWarctoolsScraper"

    if correct.well_formed:
        evaluate_scraper(scraper, correct, exp_scraper_cls=classname)
    else:
        assert not scraper.well_formed
        assert not scraper.streams
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
def test_mediainfo_scraper_mkv(filename, result_dict, evaluate_scraper):
    """
    Test Matroska scraping with Mediainfo.

    :filename: Test file name
    :result_dict: Result dict containing the test purpose, parts of
                  expected results of stdout and stderr, and expected
                  streams
    """
    mimetype = "video/x-matroska"
    correct = parse_results(filename, mimetype, result_dict, False)
    scraper = MediainfoScraper(filename=correct.filename, mimetype=mimetype)
    scraper.scrape_file()

    if "empty" in filename:
        correct.version = None
        for key in ("version", "stream_type"):
            correct.streams[0][key] = None

    if "invalid" not in filename:
        evaluate_scraper(scraper, correct)
    else:
        assert partial_message_included(correct.stdout_part,
                                        scraper.messages())
        assert partial_message_included(correct.stderr_part,
                                        scraper.errors())
        assert not scraper.streams
def test_scraper_valid(filename, mimetype, charset, scraper_class,
                       evaluate_scraper):
    """Test scraper."""
    correct = parse_results(filename, mimetype,
                            {"purpose": "Test valid file.",
                             "stdout_part": "successfully",
                             "stderr_part": ""},
                            True, {"charset": charset})

    # ODF formats for which magic cannot resolve a version.
    office_unav_version_mimes = (
        "application/vnd.oasis.opendocument.text",
        "application/vnd.oasis.opendocument.spreadsheet",
        "application/vnd.oasis.opendocument.presentation",
        "application/vnd.oasis.opendocument.graphics",
        "application/vnd.oasis.opendocument.formula",
    )

    scraper = scraper_class(filename=correct.filename, mimetype=mimetype,
                            params={"charset": charset})
    scraper.scrape_file()

    stream = correct.streams[0]
    if stream["mimetype"] == "application/xhtml+xml":
        stream["stream_type"] = "text"
    if (stream["mimetype"] in office_unav_version_mimes or
            HtmlFileMagicMeta.is_supported(stream["mimetype"])):
        stream["version"] = UNAV

    evaluate_scraper(scraper, correct)
def test_no_parameters(filename, evaluate_scraper):
    """
    Test scraper without separate parameters.

    :filename: Test file name
    """
    expected_streams = {
        0: {"stream_type": "text",
            "index": 0,
            "mimetype": MIMETYPE,
            "version": UNAP,
            "delimiter": ",",
            "separator": "\r\n",
            "quotechar": "\"",
            "first_line": ["year", "brand", "model", "detail", "other"]}
    }
    correct = parse_results(
        filename, MIMETYPE,
        {"purpose": "Test valid file on default settings.",
         "stdout_part": "successfully",
         "stderr_part": "",
         "streams": expected_streams},
        True)

    scraper = CsvScraper(correct.filename, mimetype="text/csv")
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
def test_mediainfo_scraper_avi(filename, result_dict):
    """
    Test AVI scraping with Mediainfo.

    Both Mediainfo and FFMpeg cannot be used for metadata scraping, and
    FFMpeg meets our needs better with AVI, so MediainfoScraper should
    just return one stream full of unavs to be overwritten by results
    from FFMpeg.
    """
    mimetype = "video/avi"
    correct = parse_results(filename, mimetype, result_dict, True)
    scraper = MediainfoScraper(correct.filename, True,
                               params={"mimetype_guess": mimetype})
    scraper.scrape_file()

    assert partial_message_included(correct.stdout_part, scraper.messages())
    assert partial_message_included(correct.stderr_part, scraper.errors())

    if "invalid" in filename:
        assert not scraper.streams
        return

    # Every metadata method except the index should be unavailable.
    assert len(scraper.streams) == 1
    for method in scraper.streams[0].iterate_metadata_methods():
        expected = 0 if method.__name__ == "index" else "(:unav)"
        assert method() == expected
def test_scraper_valid(filename, result_dict, params, evaluate_scraper):
    """Test scraper."""
    correct = parse_results(filename, 'text/xml', result_dict, True, params)
    scraper = Xmllint(correct.filename, correct.mimetype, True,
                      correct.params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
    # Schema notes must not leak into the messages.
    assert '<note>' not in scraper.messages()
def test_scraper_tiff(filename, result_dict, evaluate_scraper):
    """Test tiff scraping."""
    correct = parse_results(filename, "image/tiff", result_dict, True)
    # JHove always reports tiff as version 6.0.
    correct.version = "6.0"
    correct.streams[0]["version"] = "6.0"

    scraper = JHoveTiffScraper(correct.filename, True, correct.params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
def test_scraper_jpeg(filename, result_dict, evaluate_scraper):
    """Test jpeg scraping."""
    correct = parse_results(filename, "image/jpeg", result_dict, True)
    # JHove does not resolve a jpeg version.
    correct.version = "(:unav)"
    correct.streams[0]["version"] = "(:unav)"

    scraper = JHoveJpegScraper(correct.filename, True, correct.params)
    scraper.scrape_file()
    evaluate_scraper(scraper, correct)
def test_scraper(filename, result_dict, evaluate_scraper):
    """Test scraper."""
    expected = parse_results(filename, MIMETYPE, result_dict, True)
    scraper = PsppScraper(expected.filename, True, expected.params)
    scraper.scrape_file()
    # PSPP does not report a stream MIME type.
    expected.streams[0]["mimetype"] = "(:unav)"
    evaluate_scraper(scraper, expected)
def test_scraper(filename, result_dict, evaluate_scraper):
    """Test scraper."""
    expected = parse_results(filename, MIMETYPE, result_dict, True)
    # Vnu validates against HTML 5.0 only.
    expected.version = '5.0'
    expected.streams[0]['version'] = '5.0'

    scraper = Vnu(expected.filename, expected.mimetype, True,
                  expected.params)
    scraper.scrape_file()
    evaluate_scraper(scraper, expected)