def test_detect_filetype(filename, params, expected_results):
    """
    Test running only the filetype detection.

    This test ensures that the filetype detection fills in mimetype and
    version (if available from detectors) for the file, leaving
    well_formed and streams as None. Info should also contain some
    entries, but their contents are not checked.

    Then it is tested that the same results are also returned if full
    scraping is run before filetype detection.

    :param filename: Test file name
    :param params: Parameters for Scraper
    :param expected_results: Expected results, containing expected values
                             of Scraper attributes
    """
    # Filetype detection should work without scraping
    scraper = Scraper(filename, **params)
    scraper.detect_filetype()
    for field, value in expected_results.items():
        assert getattr(scraper, field) == value
    # detect_filetype() must not populate streams, only identification info
    assert scraper.streams is None
    assert scraper.info

    # Even if scraping has been done previously, detection should erase all
    # streams and other information
    scraper.scrape()
    scraper.detect_filetype()
    for field, value in expected_results.items():
        assert getattr(scraper, field) == value
    assert scraper.streams is None
    assert scraper.info
def test_charset_parameter(charset):
    """
    Test charset parameter.

    In the test we have an UTF-8 file. If given charset is None, it will
    be detected as UTF-8. Otherwise, the parameter value is used.

    :charset: Given character encoding
    """
    test_file = "tests/data/text_plain/valid__utf8_without_bom.txt"
    scraper = Scraper(test_file, charset=charset)
    scraper.detect_filetype()

    # The stored charset must be either the caller-supplied value or the
    # auto-detected UTF-8 encoding of the test file.
    # pylint: disable=protected-access
    stored_charset = scraper._params["charset"]
    assert stored_charset in [charset, "UTF-8"]
def check_well_formed(metadata_info, catalog_path):
    """
    Check if file is well formed.

    If mets specifies an alternative format or scraper identifies the
    file as something else than what is given in mets, add a message
    specifying the alternative mimetype and version. Validate file as
    the mimetype given in mets.

    :param metadata_info: Dictionary containing metadata parsed from mets.
    :param catalog_path: Schema XML catalog path to pass to file-scraper.
    :returns: Tuple with 2 dicts: (result_dict, scraper.streams)
    """
    format_info = metadata_info['format']
    mets_mimetype = format_info['mimetype']
    mets_version = format_info['version']

    messages = []
    valid_only_messages = []

    # Decide whether the METS-declared format must be forced onto the
    # scraper instead of letting it auto-detect.
    if 'alt-format' in format_info:
        # METS declares an alternative format; always honour METS.
        messages.append(
            append_format_info('METS alternative ',
                               format_info['alt-format']))
        use_mets_format = True
    else:
        # Detect the filetype and force the METS format only when the
        # detection result disagrees with the METS metadata.
        detector = Scraper(metadata_info['filename'])
        (detected_mime, detected_version) = detector.detect_filetype()
        use_mets_format = (detected_mime != mets_mimetype
                           or detected_version != mets_version)
        if use_mets_format:
            messages.append(
                append_format_info('Detected ', detected_mime,
                                   detected_version))

    forced_mimetype = None
    forced_version = None
    if use_mets_format:
        forced_mimetype = mets_mimetype
        forced_version = mets_version
        messages.append(
            append_format_info('METS ', mets_mimetype, mets_version))
        messages.append(
            append_format_info('Validating as ', mets_mimetype,
                               mets_version))
        valid_only_messages.append(
            append_format_info('The digital object will be preserved as ',
                               mets_mimetype, mets_version))

    # Run the full scrape, forcing the METS format when required.
    scraper = Scraper(metadata_info['filename'],
                      mimetype=forced_mimetype,
                      version=forced_version,
                      catalog_path=catalog_path,
                      **create_scraper_params(metadata_info))
    scraper.scrape()

    info = get_scraper_info(scraper)
    messages.extend(info['messages'])

    result = make_result_dict(is_valid=scraper.well_formed,
                              messages=messages,
                              errors=info['errors'],
                              extensions=info['extensions'],
                              valid_only_messages=valid_only_messages)
    return (result, scraper.streams)