def test_invalid_combined(fullname, mimetype, version):
    """Integration test for all invalid files.

    - Test that well_formed is False or None and mimetype is expected.
    - If well_formed is None, check that Scraper was not found.
    - Skip files that are known cases where it is identified differently
      (but yet correctly) than expected and would be well-formed.
    - Skip empty files, since those are detected as inode/x-empty and
      scraper is not found.

    :fullname: Test file path
    :mimetype: Expected MIME type
    :version: Expected file format version (unused by the assertions here)
    """
    if "empty" in fullname or fullname in IGNORE_INVALID:
        pytest.skip("[%s] has empty or in invalid ignore" % fullname)

    predefined_mimetype = GIVEN_MIMETYPES.get(fullname, None)
    predefined_charset = GIVEN_CHARSETS.get(fullname, None)
    scraper = Scraper(fullname, mimetype=predefined_mimetype,
                      charset=predefined_charset)
    scraper.scrape()

    # When the detected MIME type differs from the expected one and no
    # scraper was found, the file was identified as something else: skip.
    for info in scraper.info.values():
        if scraper.mimetype != mimetype and info["class"] == "ScraperNotFound":
            pytest.skip(("[%s] mimetype mismatches with scraper "
                         "and scraper not found") % fullname)

    assert not scraper.well_formed  # Should return either False or None
    assert scraper.mimetype == mimetype or (
        fullname in UNAV_MIMETYPE_INVALID and scraper.mimetype == UNAV)
def test_detect_filetype(filename, params, expected_results):
    """
    Test running only the filetype detection.

    This test ensures that the filetype detection fills in mimetype and
    version (if available from detectors) for the file, leaving well_formed
    and streams as None. Info should also contain some entries, but their
    contents are not checked.

    Then it is tested that the same results are also returned if full
    scraping is run before filetype detection.

    :filename: Test file name
    :params: Parameters for Scraper
    :expected_results: Expected results, containing expected values of
                       Scraper attributes
    """
    def _assert_detection(scraper):
        # Detection must fill the expected attributes and leave streams
        # unpopulated; info must contain at least one entry.
        for field, value in expected_results.items():
            assert getattr(scraper, field) == value
        assert scraper.streams is None
        assert scraper.info

    # Filetype detection should work without scraping
    scraper = Scraper(filename, **params)
    scraper.detect_filetype()
    _assert_detection(scraper)

    # Even if scraping has been done previously, detection should erase all
    # streams and other information
    scraper.scrape()
    scraper.detect_filetype()
    _assert_detection(scraper)
def test_grading(fullname, mimetype, version):
    """Test grading for a valid test file.

    Test that file format is graded as recommended unless the file is
    explicitly listed as acceptable or unacceptable.
    """
    if fullname in UNAV_VERSION:
        pytest.skip(
            "File format version of file {} can not be defined.".format(
                fullname))

    charset = GIVEN_CHARSETS.get(fullname, None)
    scraper = Scraper(fullname, mimetype=mimetype, version=version,
                      charset=charset)
    scraper.scrape()

    # The first special-case list containing the file decides its grade;
    # files on no list default to RECOMMENDED.
    special_cases = (
        (UNACCEPTABLE_FILES, UNACCEPTABLE),
        (BIT_LEVEL_FILES, BIT_LEVEL),
        (BIT_LEVEL_WITH_RECOMMENDED_FILES, BIT_LEVEL_WITH_RECOMMENDED),
        (ACCEPTABLE_FILES, ACCEPTABLE),
    )
    expected_grade = next(
        (grade for file_list, grade in special_cases if fullname in file_list),
        RECOMMENDED)

    assert scraper.grade() == expected_grade
def test_without_wellformed(fullname, mimetype):
    """Test the case where metadata is collected without well-formedness
    check.

    - Test that well-formed is always None.
    - Test that mimetype matches.
    - Test that there exists correct stream type for image, video, audio
      and text.
    - Test a random element existence for image, video, audio and text.

    :fullname: Test file path
    :mimetype: Expected MIME type
    """
    if fullname in IGNORE_FOR_METADATA:
        pytest.skip('[%s] in ignore' % fullname)

    scraper = Scraper(fullname)
    scraper.scrape(False)
    _assert_valid_scraper_result(scraper, fullname, mimetype, False)

    # For these major types the first stream's type must reflect the
    # MIME type's major part. Written as a plain conditional instead of
    # the original "not in ... or ..." form for readability.
    mimepart = mimetype.split("/")[0]
    if mimepart in ['image', 'video', 'text', 'audio']:
        assert mimepart in scraper.streams[0]['stream_type']

    # One representative metadata element per stream type.
    elem_dict = {
        'image': 'colorspace',
        'video': 'color',
        'videocontainer': 'codec_name',
        'text': 'charset',
        'audio': 'num_channels'
    }
    for stream in scraper.streams.values():
        assert stream['stream_type'] is not None
        if stream['stream_type'] in elem_dict:
            assert elem_dict[stream['stream_type']] in stream

    if 'text/csv' in mimetype:
        assert 'delimiter' in scraper.streams[0]
def test_without_wellformed(fullname, mimetype):
    """Test the case where metadata is collected without well-formedness
    check.

    - Test that well-formed is always None.
    - Test that mimetype matches.
    - Test that there exists correct stream type for image, video, audio
      and text.
    - Test a random element existence for image, video, audio and text.
    """
    if fullname in IGNORE_FOR_METADATA:
        pytest.skip("[%s] in ignore" % fullname)

    scraper = Scraper(fullname)
    scraper.scrape(False)
    _assert_valid_scraper_result(scraper, fullname, mimetype, False)

    # Major MIME part must appear in the first stream's type for these.
    mimepart = mimetype.split("/")[0]
    if mimepart in ("image", "video", "text", "audio"):
        assert mimepart in scraper.streams[0]["stream_type"]

    # Spot-check one representative metadata key per stream type.
    representative_keys = {
        "image": "colorspace",
        "video": "color",
        "videocontainer": "codec_name",
        "text": "charset",
        "audio": "num_channels"
    }
    for stream in scraper.streams.values():
        stream_type = stream["stream_type"]
        assert stream_type is not None
        expected_key = representative_keys.get(stream_type)
        if expected_key is not None:
            assert expected_key in stream

    if "text/csv" in mimetype:
        assert "delimiter" in scraper.streams[0]
def _validate_file(file_, cache_path, errors):
    """Validate file using file-scraper.

    The docstring previously documented a ``mongo_file`` parameter that
    does not exist in the signature; it has been removed.

    :param file_: file metadata dictionary (must contain ``identifier``
                  and ``file_characteristics``)
    :param cache_path: Path to the file_cache
    :param errors: list collecting identifiers of non-valid files
    :returns: None
    """
    identifier = file_["identifier"]
    file_chars = file_["file_characteristics"]
    mimetype = file_chars["file_format"]
    encoding = file_chars.get("encoding", None)
    version = file_chars.get("format_version", None)
    # Cached files are stored under their identifier.
    filepath = os.path.join(cache_path, identifier)
    scraper = Scraper(
        filepath, mimetype=mimetype, charset=encoding, version=version
    )
    scraper.scrape(check_wellformed=True)
    if not scraper.well_formed:
        errors.append(identifier)
    # Release scraper resources promptly; scraping may hold large buffers.
    del scraper
def scrape_file(filename, filerel=None, workspace=None):
    """Return already existing scraping result or create a new one, if
    missing.
    """
    if filerel is None:
        filerel = filename

    # Locate the metadata reference file, if a workspace was given.
    ref = None
    if workspace is not None:
        candidate = os.path.join(workspace, 'md-references.xml')
        if os.path.isfile(candidate):
            ref = candidate

    if ref is not None:
        root = lxml.etree.parse(ref).getroot()
        filerel = fsdecode_path(filerel)
        amdref = root.xpath("/mdReferences/mdReference[not(@stream) "
                            "and @file='%s']" % filerel)
        if amdref:
            # Reference found: return the cached pickled result if present.
            pkl_name = os.path.join(
                workspace, '{}-scraper.pkl'.format(amdref[0].text[1:]))
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl_file:
                    return pickle.load(pkl_file)

    # No cached result available: scrape metadata (no well-formedness check).
    scraper = Scraper(filename)
    scraper.scrape(False)
    return scraper.streams
def test_missing_scraper(fullname, mimetype):
    """Integration test with missing scraper.

    - Scraper is missing for the HTML files due to missing doctype.
    """
    scraper = Scraper(fullname)
    scraper.scrape()
    # Info keys are sequential integers; the last entry reports that no
    # scraper was found, and well-formedness stays undetermined.
    last_key = len(scraper.info) - 1
    assert scraper.info[last_key]['class'] == 'ScraperNotFound'
    assert scraper.well_formed is None
def test_missing_file():
    """Test missing file."""
    # Both a nonexistent path and None must yield a not-well-formed result.
    for path in ("missing_file", None):
        scraper = Scraper(path)
        scraper.scrape()
        assert not scraper.well_formed
def check_well_formed(metadata_info, catalog_path):
    """
    Check if file is well formed. If mets specifies an alternative format or
    scraper identifies the file as something else than what is given in mets,
    add a message specifying the alternative mimetype and version. Validate
    file as the mimetype given in mets.

    :param metadata_info: Dictionary containing metadata parsed from mets.
    :param catalog_path: Schema XML catalog path to pass to file-scraper.
    :returns: Tuple with 2 dicts: (result_dict, scraper.streams)
    """
    messages = []
    valid_only_messages = []
    md_mimetype = metadata_info['format']['mimetype']
    md_version = metadata_info['format']['version']
    # force_mimetype becomes True when the METS-given type must be forced
    # onto the scraper (alt-format present, or detection disagrees).
    force_mimetype = False
    if 'alt-format' in metadata_info['format']:
        # METS declares an alternative format: always validate as the METS
        # mimetype, and record the alternative in the messages.
        messages.append(
            append_format_info('METS alternative ',
                               metadata_info['format']['alt-format']))
        force_mimetype = True
    else:
        # No alternative format: run detection only, and force the METS
        # type if detection disagrees with it.
        scraper = Scraper(metadata_info['filename'])
        (mime, version) = scraper.detect_filetype()
        if mime != md_mimetype or version != md_version:
            messages.append(append_format_info('Detected ', mime, version))
            force_mimetype = True
    scraper_mimetype = None
    scraper_version = None
    if force_mimetype:
        scraper_mimetype = md_mimetype
        scraper_version = md_version
        # Message order is deliberate: METS type, then what we validate as,
        # then the valid-only preservation note.
        messages.append(append_format_info('METS ', md_mimetype, md_version))
        messages.append(
            append_format_info('Validating as ', md_mimetype, md_version))
        valid_only_messages.append(
            append_format_info('The digital object will be preserved as ',
                               md_mimetype, md_version))
    # Full scrape, possibly with the forced METS mimetype/version (None
    # values let the scraper decide on its own).
    scraper = Scraper(metadata_info['filename'],
                      mimetype=scraper_mimetype,
                      version=scraper_version,
                      catalog_path=catalog_path,
                      **create_scraper_params(metadata_info))
    scraper.scrape()
    scraper_info = get_scraper_info(scraper)
    messages.extend(scraper_info['messages'])
    return (make_result_dict(is_valid=scraper.well_formed,
                             messages=messages,
                             errors=scraper_info['errors'],
                             extensions=scraper_info['extensions'],
                             valid_only_messages=valid_only_messages),
            scraper.streams)
def test_grade(file_path, expected_grade):
    """Test that scraper returns correct digital preservation grade."""
    scraper = Scraper(file_path)

    # Before scraping no grade can be determined.
    assert scraper.grade() == "(:unav)"

    # After scraping the expected grade must be reported.
    scraper.scrape()
    assert scraper.grade() == expected_grade
def scrape_file(filepath, filerel=None, workspace=None, mimetype=None,
                version=None, charset=None, skip_well_check=False,
                skip_json=False):
    """
    Return already existing scraping result or create a new one, if missing.

    :filepath: Digital object path
    :filerel: Digital object path relative to base path
    :workspace: Workspace path
    :mimetype: MIME type of digital object
    :version: File format version of digital object
    :charset: Encoding of digital object (if text file)
    :skip_well_check: True skips well-formedness checking
    :skip_json: True does scraping and does not try to find JSON file
    :returns: Metadata dict of streams and scraper info as a tuple
    :raises: ValueError If metadata collecting fails.
             IOError If file does not exist.
    """
    filerel = filepath if filerel is None else filerel

    # Prefer a previously stored JSON result when allowed; in that case
    # there is no scraper info to return.
    streams = None
    if not skip_json:
        streams = read_json_streams(filerel, workspace)
    if streams is not None:
        return (streams, None)

    scraper = Scraper(filepath, mimetype=mimetype, version=version,
                      charset=charset)
    scraper.scrape(not skip_well_check)

    # NOTE: the error checks below are order-sensitive. A definite
    # well-formedness failure is reported first; only then the
    # missing-file and missing-scraper cases are examined.
    if scraper.well_formed is False:  # Must not be None
        errors = []
        for _, info in six.iteritems(scraper.info):
            errors.append("\n".join(info['errors']))
        error_str = "\n".join(errors)
        if skip_well_check:
            # Without a well-formedness check a False result means even
            # metadata collection failed; prepend an explanatory header.
            error_head = "Metadata of file %s could not " \
                         "be collected due to errors.\n" % filepath
            error_str = error_head + error_str
        raise ValueError(six.text_type(error_str))

    # info[0] is always the file existence check.
    if scraper.info[0]['class'] == 'FileExists' and scraper.info[0]['errors']:
        raise IOError(scraper.info[0]['errors'])

    for _, info in six.iteritems(scraper.info):
        if info['class'] == 'ScraperNotFound':
            raise ValueError('File format is not supported.')

    return (scraper.streams, scraper.info)
def test_missing_file():
    """Test missing file."""
    # Both a nonexistent path and None must fail the existence check,
    # producing exactly one info entry from the FileExists scraper.
    for path in ("missing_file", None):
        scraper = Scraper(path, mimetype="application/pdf")
        scraper.scrape()
        assert not scraper.well_formed
        assert len(scraper.info) == 1
        assert scraper.info[0]["class"] == "FileExists"
def test_without_wellformed(fullname, mimetype, version):
    """
    Test the case where metadata is collected without well-formedness check.

    - Test that well-formed is always None.
    - Test that mimetype and version matches.
    - Test that there exists correct stream type for image, video, audio
      and text.
    - Test a random element existence for image, video, audio and text.
    - Test that giving the resulted MIME type, version and charset produce
      the same results.
    """
    if fullname in IGNORE_FOR_METADATA:
        pytest.skip("[%s] in ignore" % fullname)

    mimetype_param = GIVEN_MIMETYPES.get(fullname, None)
    charset_param = GIVEN_CHARSETS.get(fullname, None)
    scraper = Scraper(fullname, mimetype=mimetype_param,
                      charset=charset_param)
    scraper.scrape(False)
    _assert_valid_scraper_result(scraper, fullname, mimetype, version, None)

    # Major MIME part must be reflected in the first stream's type.
    mimepart = mimetype.split("/")[0]
    if mimepart in ("image", "video", "text", "audio"):
        assert mimepart in scraper.streams[0]["stream_type"]

    # Spot-check a representative metadata element per stream type.
    representative_keys = {
        "image": "colorspace",
        "video": "color",
        "videocontainer": "codec_name",
        "text": "charset",
        "audio": "num_channels"
    }
    for stream in scraper.streams.values():
        stream_type = stream["stream_type"]
        assert stream_type not in [UNAV, None]
        expected_key = representative_keys.get(stream_type)
        if expected_key is not None:
            assert expected_key in stream

    # Test that output does not change if MIME type and version are given
    # to be the ones scraper would determine them to be in any case.
    rescraper = Scraper(fullname,
                        mimetype=scraper.mimetype,
                        version=scraper.version,
                        charset=scraper.streams[0].get("charset", None))
    rescraper.scrape(False)
    assert rescraper.mimetype == scraper.mimetype
    assert rescraper.version == scraper.version
    assert rescraper.streams == scraper.streams
    assert rescraper.well_formed == scraper.well_formed
def scrape_file(ctx, filename, check_wellformed, tool_info, mimetype,
                version):
    """
    Identify file type, collect metadata, and optionally check
    well-formedness.

    In addition to the given options, the user can provide any extra options
    that are passed onto the scraper. These options must be in the long form,
    e.g. "--charset=UTF-8" or "--charset UTF-8".
    \f
    :ctx: Context object
    :filename: Path to the file that should be scraped
    :check_wellformed: Flag whether the scraper checks wellformedness
    :tool_info: Flag whether the scraper includes messages from different
                3rd party tools
    :mimetype: Specified mimetype for the scraped file
    :version: Specified version for the scraped file
    """
    scraper = Scraper(filename, mimetype=mimetype, version=version,
                      **_extra_options_to_dict(ctx.args))
    scraper.scrape(check_wellformed=check_wellformed)

    results = {
        "path": ensure_text(scraper.filename),
        "MIME type": ensure_text(scraper.mimetype),
        "version": ensure_text(scraper.version),
        "metadata": scraper.streams,
        "grade": scraper.grade()
    }
    if check_wellformed:
        results["well-formed"] = scraper.well_formed
    if tool_info:
        results["tool_info"] = scraper.info

    # Collect per-scraper errors; abort entirely if no scraper was found.
    errors = {}
    for item in scraper.info.values():
        if "ScraperNotFound" in item["class"]:
            raise click.ClickException("Proper scraper was not found. The "
                                       "file was not analyzed.")
        if item["errors"]:
            errors[item["class"]] = item["errors"]
    if errors:
        results["errors"] = errors

    click.echo(json.dumps(results, indent=4))
def test_given_filetype(filepath, params, well_formed, expected_mimetype,
                        expected_version, expected_charset,
                        meta_well_formed):
    """
    Test the scraping to be done as user given file type.

    MIME type and version results are checked both directly from the scraper
    and for well-formed files also from the first stream. In addition to
    this, well-formedness status of the file should be as expected.

    :filepath: Test file path
    :params: Parameters for Scraper
    :well_formed: Expected result of well-formedness
    :expected_mimetype: Expected MIME type result
    :expected_version: Expected file format version
    :expected_charset: Expected character encoding, or a falsy value when
                       no charset should be present in the stream
    :meta_well_formed: Expected well-formedness result when scraping
                       without the well-formedness check
    """
    scraper = Scraper(filename=filepath, **params)
    scraper.scrape()
    assert scraper.well_formed == well_formed
    assert scraper.mimetype == expected_mimetype
    assert scraper.version == expected_version
    if expected_charset:
        assert scraper.streams[0]["charset"] == expected_charset
    else:
        assert "charset" not in scraper.streams[0]
    assert scraper.streams[0]["mimetype"] == expected_mimetype
    assert scraper.streams[0]["version"] == expected_version

    # Just collect metadata without well-formedness checking
    # WARC files can not be scraped without well-formedness check
    if expected_mimetype == "application/warc":
        return

    scraper = Scraper(filename=filepath, **params)
    scraper.scrape(False)
    assert scraper.well_formed == meta_well_formed
    assert scraper.mimetype == expected_mimetype
    assert scraper.version == expected_version
    if expected_charset:
        assert scraper.streams[0]["charset"] == expected_charset
    else:
        assert "charset" not in scraper.streams[0]
    assert scraper.streams[0]["mimetype"] == expected_mimetype
    assert scraper.streams[0]["version"] == expected_version
def test_coded_filename(testpath, fullname, mimetype):
    """Integration test with unicode and utf-8 filename and with all
    scrapers.

    - Test that unicode filenames work with all mimetypes
    - Test that utf-8 encoded filenames work with all mimetypes
    """
    if fullname in IGNORE_VALID + ["tests/data/text_xml/valid_1.0_dtd.xml"]:
        pytest.skip("[%s] in ignore" % fullname)

    # Copy the test file to a name containing non-ASCII characters,
    # keeping the original extension.
    ext = fullname.rsplit(".", 1)[-1]
    unicode_name = os.path.join(testpath, "äöå.%s" % ext)
    shutil.copy(fullname, unicode_name)

    # Scrape once with the unicode path and once with its UTF-8 bytes.
    for path in (unicode_name, unicode_name.encode("utf-8")):
        scraper = Scraper(path)
        scraper.scrape()
        assert scraper.well_formed
def test_charset(filepath, charset, well_formed):
    """
    Test charset parameter.

    We are able to give charset as a parameter. This tests the parameter
    with different mimetypes and charset inputs.

    :filepath: Test file path
    :charset: Given and expected character encoding of a test file
    :well_formed: Expected result of well-formedness
    """
    mimetype_param = GIVEN_MIMETYPES.get(filepath, None)
    scraper = Scraper(filepath, mimetype=mimetype_param, charset=charset)
    scraper.scrape()
    # The given charset must be reported back unchanged in the first stream.
    assert scraper.well_formed == well_formed
    assert scraper.streams[0]["charset"] == charset
def test_valid_combined(fullname, mimetype):
    """Integration test for valid files.

    - Test that mimetype matches.
    - Test Find out all None elements.
    - Test that errors are not given.
    - Test that all files are well-formed.
    - Test that forcing the scraper to use the MIME type and version the
      file actually is does not affect scraping results.
    - Ignore few files because of required parameter or missing scraper.
    """
    if fullname in IGNORE_VALID:
        pytest.skip("[%s] in ignore" % fullname)

    scraper = Scraper(fullname)
    scraper.scrape()

    for _, info in iteritems(scraper.info):
        assert not info["errors"]
    _assert_valid_scraper_result(scraper, fullname, mimetype, True)

    # Test that output does not change if MIME type and version are forced
    # to be the ones scraper would determine them to be in any case.
    # This cannot be done with compressed arcs, as WarctoolsScraper reports
    # the MIME type of the compressed archive instead of application/gzip,
    # so for those types, all required testing is already done here.
    if (scraper.mimetype in ["application/x-internet-archive"] and
            fullname.endswith(".gz")):
        return

    # Forced version affects all frames within a gif or a tiff
    if scraper.mimetype in ["image/gif", "image/tiff"]:
        for _, stream in iteritems(scraper.streams):
            if "version" in stream.keys():
                stream["version"] = scraper.streams[0]["version"]

    forced_scraper = Scraper(fullname, mimetype=scraper.mimetype,
                             version=scraper.version)
    forced_scraper.scrape()

    assert forced_scraper.mimetype == scraper.mimetype
    assert forced_scraper.version == scraper.version
    assert forced_scraper.streams == scraper.streams
def test_valid_combined(fullname, mimetype):
    """Integration test for valid files.

    - Test that mimetype matches.
    - Test Find out all None elements.
    - Test that errors are not given.
    - Test that all files are well-formed.
    - Ignore few files because of required parameter or missing scraper.
    """
    if fullname in IGNORE_VALID:
        pytest.skip('[%s] in ignore' % fullname)

    scraper = Scraper(fullname)
    scraper.scrape()

    # No scraper may report any error for a valid file.
    assert all(not info['errors'] for _, info in iteritems(scraper.info))
    _assert_valid_scraper_result(scraper, fullname, mimetype, True)
def main(arguments=None):
    """Main loop"""
    usage = "usage: %prog [options] xml-file-name"

    catalog_path = ("/etc/xml/dpres-xml-schemas/schema_catalogs")
    schema_path = ("/etc/xml/dpres-xml-schemas/schema_catalogs/schemas")

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-c", "--catalog", dest="catalogpath",
                      default=os.path.join(
                          catalog_path, "catalog_main.xml"),
                      help="Full path to XML catalog file",
                      metavar="FILE")
    parser.add_option("-s", "--schemapath", dest="schemapath",
                      default=os.path.join(schema_path, "mets/mets.xsd"),
                      help="XML schema filename for validation",
                      metavar="PATH")

    (options, args) = parser.parse_args(arguments)
    if len(args) != 1:
        parser.error("Must give XML filename as argument")
    filename = args[0]

    # Scrape as METS XML with fixed type, version and encoding.
    scraper = Scraper(filename, schema=options.schemapath,
                      catalog_path=options.catalogpath,
                      mimetype="text/xml", version="1.0", charset="UTF-8")
    scraper.scrape()

    info = get_scraper_info(scraper)
    messages = list(info['messages'])
    errors = list(info['errors'])

    if messages:
        print(ensure_text(concat(messages)), file=sys.stdout)
    if errors:
        print(ensure_text(concat(errors)), file=sys.stderr)

    # Exit code 117 signals validation failure.
    if errors or not scraper.well_formed:
        return 117
    return 0
def test_forced_filetype(filepath, params, well_formed, expected_mimetype,
                         expected_version):
    """
    Test forcing the scraping to be done as specific file type.

    MIME type and version results are checked both directly from the scraper
    and for well-formed files also from the first stream. In addition to
    this, well-formedness status of the file should be as expected.
    """
    scraper = Scraper(filepath, **params)
    scraper.scrape()

    assert scraper.well_formed == well_formed
    assert scraper.mimetype == expected_mimetype
    assert scraper.version == expected_version

    # Stream-level results are only checked for well-formed files.
    if well_formed:
        first_stream = scraper.streams[0]
        assert first_stream["mimetype"] == expected_mimetype
        assert first_stream["version"] == expected_version
def _scrape_file(self, filepath, skip_well_check):
    """Scrape file.

    :filepath: Path to file to be scraped
    :skip_well_check: True, if well-formed check is skipped
    :returns: scraper with result attributes
    :raises: ValueError if the file is not well-formed
    """
    scraper = Scraper(filepath)
    if not skip_well_check:
        scraper.scrape(True)
        if not scraper.well_formed:
            errors = []
            for _, info in six.iteritems(scraper.info):
                # BUG FIX: info['errors'] is a list of strings. The old
                # code appended the list itself, so "\n".join(errors)
                # raised TypeError whenever errors existed. Extend with
                # the individual strings instead.
                errors.extend(info['errors'])
            error_str = "\n".join(errors)
            raise ValueError(error_str)
    else:
        scraper.scrape(False)
    return scraper
def test_valid_combined(fullname, mimetype, version):
    """
    Integration test for valid files.

    - Test that mimetype and version matches.
    - Test Find out all None elements.
    - Test that errors are not given.
    - Test that all files are well-formed.
    - Ignore few files because of required parameter or missing scraper.
    - Test that giving the resulted MIME type, version and charset produce
      the same results.
    """
    if fullname in IGNORE_VALID:
        pytest.skip("[%s] in ignore" % fullname)

    mimetype_param = GIVEN_MIMETYPES.get(fullname, None)
    charset_param = GIVEN_CHARSETS.get(fullname, None)
    scraper = Scraper(fullname, mimetype=mimetype_param,
                      charset=charset_param)
    scraper.scrape()

    # A valid file must produce no errors from any scraper.
    for _, info in iteritems(scraper.info):
        assert not info["errors"]
    _assert_valid_scraper_result(scraper, fullname, mimetype, version, True)

    # Test that output does not change if MIME type and version are given
    # to be the ones scraper would determine them to be in any case.
    rescraper = Scraper(fullname,
                        mimetype=scraper.mimetype,
                        version=scraper.version,
                        charset=scraper.streams[0].get("charset", None))
    rescraper.scrape()
    assert rescraper.mimetype == scraper.mimetype
    assert rescraper.version == scraper.version
    assert rescraper.streams == scraper.streams
    assert rescraper.well_formed == scraper.well_formed
def _scrape_file(self, filepath, skip_well_check, file_format=None,
                 charset=None):
    """Scrape file.

    :filepath: Path to file to be scraped
    :skip_well_check: True, if well-formed check is skipped
    :file_format: File format and version from the command line argument
                  parser, originally given as a value pair by the user.
                  The mimetype is in index 0 and version in index 1.
    :charset: Character encoding from arguments
    :returns: scraper with result attributes
    :raises: ValueError if the file is not well-formed
    """
    # Unpack the (mimetype, version) pair; None/() mean "not given".
    if file_format in (None, ()):
        mimetype, version = None, None
    else:
        mimetype, version = file_format[0], file_format[1]

    scraper = Scraper(filepath, mimetype=mimetype, version=version,
                      charset=charset)
    if skip_well_check:
        scraper.scrape(False)
    else:
        scraper.scrape(True)
        if not scraper.well_formed:
            # Gather every individual error string from all scrapers.
            errors = []
            for _, info in six.iteritems(scraper.info):
                errors.extend(info['errors'])
            error_str = "\n".join(errors)
            raise ValueError(error_str)
    return scraper
def test_invalid_combined(fullname, mimetype):
    """Integration test for all invalid files.

    - Test that well_formed is False and mimetype is expected.
    - If well_formed is None, check that Scraper was not found.
    - Skip files that are known cases where it is identified differently
      (but yet correctly) than expected and would be well-formed.
    - Skip empty files, since those are detected as inode/x-empty and
      scraper is not found.
    """
    if 'empty' in fullname or fullname in IGNORE_INVALID:
        pytest.skip('[%s] has empty or in invalid ignore' % fullname)

    scraper = Scraper(fullname)
    scraper.scrape()

    # When the file was identified as something else and no scraper was
    # found for it, the case is out of scope for this test.
    mimetype_mismatch = scraper.mimetype != mimetype
    for _, info in iteritems(scraper.info):
        if mimetype_mismatch and info['class'] == 'ScraperNotFound':
            pytest.skip(('[%s] mimetype mismatches with scraper '
                         'and scraper not found') % fullname)

    assert scraper.well_formed is False  # Could be also None (wrong)
    assert (scraper.mimetype == mimetype or
            fullname in DIFFERENT_MIMETYPE_INVALID)
def test_empty_file():
    """Test empty file."""
    empty_path = "test/data/text_plain/invalid__empty.txt"
    scraper = Scraper(empty_path)
    scraper.scrape()
    # An empty file must never be reported as well-formed.
    assert not scraper.well_formed