def validate_bag(bag_path, fast=False, callback=None, config_file=bdbag.DEFAULT_CONFIG_FILE):
    """Validate the bag at bag_path, raising an exception on any failure.

    :param bag_path: path to the bag directory to validate.
    :param fast: if True, perform fast (payload-oxum only) validation.
    :param callback: optional progress callback passed through to bagit.
    :param config_file: bdbag configuration file supplying bag_processes.
    :raises bagit.BagIncompleteError: when fetch.txt references are unresolved.
    :raises bagit.BagValidationError: summarizing all per-file validation errors.
    :raises bagit.InterruptedError: when validation is interrupted.
    :raises RuntimeError: for any other unexpected failure.
    """
    config = read_config(config_file)
    bag_config = config['bag_config']
    bag_processes = bag_config.get('bag_processes', 1)
    try:
        logger.info("Validating bag: %s" % bag_path)
        bag = bagit.Bag(bag_path)
        # A progress callback cannot be shared across worker processes, so
        # force single-process validation whenever one is supplied.
        bag.validate(bag_processes if not callback else 1, fast=fast, callback=callback)
        logger.info("Bag %s is valid" % bag_path)
    except bagit.BagIncompleteError as e:
        logger.warning(
            "BagIncompleteError: %s %s", e,
            "This validation error may be transient if the bag contains unresolved remote file references "
            "from a fetch.txt file. In this case the bag is incomplete but not necessarily invalid. "
            "Resolve remote file references (if any) and re-validate.")
        raise e
    except bagit.BagValidationError as e:
        # Collapse the individual failures into one summary error.
        errors = list()
        for d in e.details:
            errors.append(bdbag.get_named_exception(d))
        raise bagit.BagValidationError('\nError: '.join(errors))
    except bagit.InterruptedError as e:
        # Fix: Logger.warn is deprecated in favor of Logger.warning.
        logger.warning(bdbag.get_named_exception(e))
        raise e
    except Exception as e:
        raise RuntimeError("Unhandled exception while validating bag: %s" % e)
def check_payload_consistency(bag, skip_remote=False, quiet=False):
    """Check whether the bag payload agrees with its manifests and fetch.txt.

    Returns True when the on-disk payload, the manifests, and (unless
    skip_remote is set) the remote file references are mutually consistent;
    otherwise logs each discrepancy (unless quiet) and returns False.
    """
    only_in_manifests, only_on_fs, only_in_fetch = bag.compare_manifests_with_fs_and_fetch()
    payload_consistent = not only_on_fs

    if not skip_remote:
        updated_remote_files = sorted(bag.remote_entries.keys())
        existing_remote_files = sorted(list(bag.files_to_be_fetched(False)))
        # Fetch entries that have no corresponding local payload file yet.
        unresolved_fetch_files = set(bag.files_to_be_fetched()) - set(bag.payload_files())
        modified_remote_files = list(set(updated_remote_files) - set(existing_remote_files))
        normalized_updated_remote_files = {os.path.normpath(name) for name in updated_remote_files}
        unresolved_manifest_files = list(set(only_in_manifests) - normalized_updated_remote_files)
        if modified_remote_files or only_in_fetch:
            payload_consistent = False
        if unresolved_manifest_files:
            payload_consistent = False
        if unresolved_fetch_files:
            payload_consistent = False
        # A fetched file whose on-disk size disagrees with the size declared
        # in fetch.txt also renders the payload inconsistent.
        for url, size, path in bag.fetch_entries():
            output_path = os.path.normpath(os.path.join(bag.path, path))
            if os.path.exists(output_path) and os.path.getsize(output_path) != int(size):
                payload_consistent = False
    elif payload_consistent:
        payload_consistent = not only_in_manifests

    for path in only_in_manifests:
        e = bagit.FileMissing(path)
        if not quiet:
            logger.warning(
                "%s. Resolve this file reference by either 1) adding the missing file to the bag payload or 2) adding "
                "a remote file reference in fetch.txt. or 3) re-run with the \"update\" flag set in order to remove "
                "this file from the bag manifest." % bdbag.get_named_exception(e))
    for path in only_on_fs:
        e = bagit.UnexpectedFile(path)
        if not quiet:
            logger.warning(
                "%s. Re-run with the \"update\" flag set in order to add this file to the manifest."
                % bdbag.get_named_exception(e))
    if not skip_remote:
        for path in only_in_fetch:
            e = bagit.UnexpectedRemoteFile(path)
            if not quiet:
                logger.warning(
                    "%s. Ensure that any remote file references from fetch.txt are also present in the manifest and "
                    "re-run with the \"update\" flag set in order to apply this change."
                    % bdbag.get_named_exception(e))

    return payload_consistent
def test_revert_bag(self):
    """Reverting a bag removes all bag metadata files and restores the payload layout."""
    logger.info(self.getTestHeader('revert bag'))
    try:
        bdb.revert_bag(self.test_bag_dir)
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'bag-info.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'bagit.txt')))
        # All manifests/tagmanifests for every supported algorithm must be gone.
        # Fix: the original asserted 'manifest-sha1.txt' twice; the duplicate is removed.
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-md5.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha512.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-md5.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha512.txt')))
        # The payload must be back at the top level, with the data directory removed.
        self.assertTrue(ospif(ospj(self.test_bag_dir, 'README.txt')))
        self.assertTrue(ospif(ospj(self.test_bag_dir, ospj('test1', 'test1.txt'))))
        self.assertTrue(ospif(ospj(self.test_bag_dir, ospj('test2', 'test2.txt'))))
        self.assertFalse(ospe(ospj(self.test_bag_dir, 'data')))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def get_file(url, output_path, auth_config, credentials=None):
    """Retrieve url via FTP into output_path; returns True on success, False on failure."""
    try:
        if not credentials:
            credentials = get_credentials(url, auth_config)
        output_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        logger.info("Attempting FTP retrieve from URL: %s" % url)
        # Embed the credentials into the netloc of the URL; fall back to
        # anonymous FTP conventions when no credentials are configured.
        creds = "%s:%s@" % (credentials[0] or "anonymous", credentials[1] or "*****@*****.**")
        url_parts = urlsplit(url)
        full_url = urlunsplit((url_parts.scheme,
                               "%s%s" % (creds, url_parts.netloc),
                               url_parts.path,
                               url_parts.query,
                               url_parts.fragment))
        started = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        urlretrieve(full_url, output_path)
        elapsed = datetime.datetime.now() - started
        byte_count = os.path.getsize(output_path)
        totalSecs = elapsed.total_seconds()
        totalMBs = float(byte_count) / float((1024 * 1024))
        throughput = str("%.3f MB/second" % (totalMBs / totalSecs if totalSecs > 0 else 0.001))
        logger.info('File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. ' %
                    (output_path, totalMBs, throughput, elapsed))
        return True
    except Exception as e:
        logger.error('FTP Request Exception: %s' % (bdbag.get_named_exception(e)))
        logger.warning('File transfer failed: [%s]' % output_path)
        return False
def test_archive_bag_tar(self):
    """Archiving a bag in tar format produces an archive file on disk."""
    logger.info(self.getTestHeader('archive bag tar format'))
    try:
        archive_file = bdb.archive_bag(self.test_bag_dir, 'tar')
        self.assertTrue(ospif(archive_file))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_create_bag_with_config(self):
    """Creating a bag with a custom config restricts checksums to md5 and sets metadata."""
    logger.info(self.getTestHeader('create bag with config'))
    try:
        created = bdb.make_bag(self.test_data_dir,
                               config_file=(ospj(self.test_config_dir, 'test-config.json')))
        self.assertIsInstance(created, bagit.Bag)
        # The test config selects md5 only, so no sha* manifests should exist.
        self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-sha512.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-sha512.txt')))
        baginfo = ospj(self.test_data_dir, 'bag-info.txt')
        with open(baginfo) as bi:
            baginfo_txt = bi.read()
        self.assertIn('Contact-Name: bdbag test', baginfo_txt)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def get_session(url, auth_config):
    """Return a requests session for url, authenticating via a matching auth_config entry.

    Falls back to a cached (or new) unauthenticated session keyed by the
    URL's scheme://netloc when no keychain entry matches or authentication fails.
    """
    session = None
    response = None
    # Consider only keychain entries whose URI is a case-insensitive substring of the URL.
    for auth in list((entry for entry in auth_config
                      if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):
        try:
            if not validate_auth_config(auth):
                continue
            if not auth.auth_uri:
                continue
            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break
            else:
                session = get_new_session()
                if auth.auth_type == 'http-basic':
                    session.auth = (auth.auth_params.username, auth.auth_params.password)
                    auth_method = auth.auth_method.lower()
                    if auth_method == 'post':
                        response = session.post(auth.auth_uri, auth=session.auth)
                    elif auth_method == 'get':
                        response = session.get(auth.auth_uri, auth=session.auth)
                elif auth.auth_type == 'http-form':
                    response = session.post(auth.auth_uri,
                                            {auth.auth_params.username_field: auth.auth_params.username,
                                             auth.auth_params.password_field: auth.auth_params.password})
                if response is None:
                    # Fix: an unknown auth_type/auth_method previously left response as None,
                    # so response.status_code raised AttributeError into the handler below.
                    logger.warning("Unsupported auth_type or auth_method for keychain entry: %s" % auth.uri)
                    continue
                if response.status_code > 203:
                    # Fix: Logger.warn is deprecated in favor of Logger.warning.
                    logger.warning('Authentication failed with Status Code: %s %s\n' %
                                   (response.status_code, response.text))
                else:
                    logger.info("Session established: %s", auth.auth_uri)
                    SESSIONS[auth.auth_uri] = session
                    break
        except Exception as e:
            logger.warning("Unhandled exception during HTTP(S) authentication: %s" %
                           bdbag.get_named_exception(e))
    if not session:
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = get_new_session()
            SESSIONS[base_url] = session
    return session
def test_create_bag(self):
    """A plain make_bag call yields a bagit.Bag instance."""
    logger.info(self.getTestHeader('create bag'))
    try:
        created = bdb.make_bag(self.test_data_dir)
        self.assertIsInstance(created, bagit.Bag)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_remote(self):
    """Updating with a remote file manifest writes the expected fetch.txt entries."""
    logger.info(self.getTestHeader('update bag add remote file manifest'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               remote_file_manifest=ospj(self.test_config_dir,
                                                         'test-fetch-manifest.json'))
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(
            ['Generating remote file references from', 'test-fetch-manifest.json'], output)
        fetch_file = ospj(self.test_bag_dir, 'fetch.txt')
        self.assertTrue(ospif(fetch_file))
        with open(fetch_file) as ff:
            fetch_txt = ff.read()
        self.assertIn(
            'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json'
            '\t723\tdata/bdbag-profile.json', fetch_txt)
        self.assertIn(
            'ark:/88120/r8059v\t632860\tdata/minid_v0.1_Nov_2015.pdf', fetch_txt)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def resolve(ark):
    """Resolve an ARK identifier into a list of candidate URLs.

    :param ark: ARK identifier string, or None.
    :return: None if ark is None; otherwise a (possibly empty) list of URLs
             extracted from the resolver's JSON 'locations' entries.
    """
    if ark is None:
        return None
    urls = []
    resolver_url = ''.join((RESOLVER_URL, '/', ark))
    logger.info("Attempting to resolve %s into a valid set of URLs." % ark)
    r = requests.get(resolver_url, headers={'accept': 'application/json', 'Connection': 'close'})
    if r.status_code != 200:
        logger.error('HTTP GET Failed for: %s' % r.url)
        logger.error("Host %s responded:\n\n%s" % (urlsplit(r.url).netloc, r.text))
    else:
        info = {}
        try:
            info = json.loads(r.text, object_pairs_hook=OrderedDict)
        except Exception as e:
            # Fix: Logger.warn is deprecated in favor of Logger.warning.
            logger.warning("Unable to parse ARK resolution result, a MINID or other supported JSON metadata structure "
                           "was not found. Exception: %s" % bdbag.get_named_exception(e))
        # need a better way to validate minid response structure
        locations = info.get('locations', list())
        for location in locations:
            uri = location.get('uri', None)
            if uri:
                urls.append(uri)
    if urls:
        logger.info("The identifier %s resolved into the following locations: %s" % (ark, urls))
    else:
        logger.warning("No file locations were found for identifier %s" % ark)
    return urls
def test_validate_profile(self):
    """Profile validation on a bag returns a bagit_profile.Profile."""
    logger.info(self.getTestHeader('validate profile'))
    try:
        result = bdb.validate_bag_profile(self.test_bag_dir)
        self.assertIsInstance(result, bagit_profile.Profile)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def get_file(url, output_path, auth_config, token=None, dest_endpoint=None):
    """Submit a Globus transfer of url to output_path.

    :param url: globus:// style URL; hostname is the source endpoint, path the source file.
    :param output_path: local destination path on the destination endpoint.
    :param auth_config: keychain entries consulted when no token is supplied.
    :param token: optional pre-obtained Globus access token.
    :param dest_endpoint: optional destination endpoint id.
    :return: True if the transfer was submitted, False otherwise.
    """
    try:
        src_endpoint = urlsplit(url).hostname
        src_path = urlsplit(url).path
        if platform.system() == "Windows":
            # Globus expects POSIX-style paths: drop the drive colon, flip separators.
            dest_path = ''.join(('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)
        if not token:
            token, dest_endpoint = authenticate(url, auth_config)
        if token is None:
            # Fix: Logger.warn is deprecated in favor of Logger.warning.
            logger.warning("A valid Globus access token is required to create transfers. "
                           "Check keychain.cfg for valid parameters.")
            return False
        if dest_endpoint is None:
            logger.warning("A valid Globus destination endpoint must be specified. "
                           "Check keychain.cfg for valid parameters.")
            return False
        # initialize transfer client
        client = globus_sdk.TransferClient(token=token)
        # Activate source endpoint (return value intentionally unused).
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        client.endpoint_autoactivate(src_endpoint, if_expires_in=600)
        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)
        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))
        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client, src_endpoint, dest_endpoint, label=label)
        tdata.add_item(src_path, dest_path, recursive=False)
        # start the transfer
        data = client.submit_transfer(tdata)
        task_id = data["task_id"]
        logger.info("Globus transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return True
    except Exception as e:
        logger.error('Globus transfer request exception: %s' % bdbag.get_named_exception(e))
        return False
def test_resolve_fetch_ark(self):
    """Resolving ARK fetch references leaves the bag fully valid."""
    logger.info(self.getTestHeader('test resolve fetch ark'))
    try:
        bdb.resolve_fetch(self.test_bag_fetch_ark_dir)
        bdb.validate_bag(self.test_bag_fetch_ark_dir, fast=False)
        output = self.stream.getvalue()
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_validate_profile_serialization(self):
    """Serialization validation of a zipped bag against the bdbag profile succeeds."""
    logger.info(self.getTestHeader('validate profile serialization'))
    try:
        bag_path = ospj(self.test_archive_dir, 'test-bag.zip')
        bdb.validate_bag_serialization(
            bag_path,
            bag_profile_path='https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json')
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_validate_incomplete_bag_full(self):
    """Full validation of an incomplete bag raises BagValidationError, naming the missing files."""
    logger.info(self.getTestHeader('test full validation incomplete bag'))
    try:
        self.assertRaises(bagit.BagValidationError,
                          bdb.validate_bag,
                          self.test_bag_incomplete_dir,
                          fast=False)
        output = self.stream.getvalue()
        self.assertExpectedMessages(
            ['bdbag-profile.json does not exist', 'minid_v0.1_Nov_2015.pdf does not exist'],
            output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_extract_bag_archive_tar(self):
    """Extracting a tar archive yields a directory recognized as a bag; clean up afterwards."""
    logger.info(self.getTestHeader('extract bag tar format'))
    try:
        bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
        self.assertTrue(ospe(bag_path))
        self.assertTrue(bdb.is_bag(bag_path))
        bdb.cleanup_bag(os.path.dirname(bag_path))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_validate_incomplete_bag_fast(self):
    """Fast validation of an incomplete bag raises BagIncompleteError."""
    logger.info(self.getTestHeader('test fast validation incomplete bag'))
    try:
        self.assertRaises(bagit.BagIncompleteError,
                          bdb.validate_bag,
                          self.test_bag_incomplete_dir,
                          fast=True)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_remove_file(self):
    """Updating after deleting a payload file drops it from the manifests."""
    logger.info(self.getTestHeader('update bag remove file'))
    try:
        os.remove(ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt'))
        updated = bdb.make_bag(self.test_bag_dir, update=True)
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertUnexpectedMessages(['test1.txt'], output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def get_file(url, output_path, token=None, dest_endpoint=None):
    """Submit a Globus transfer of url to output_path using KEYCHAIN credentials.

    :return: True if the transfer was submitted, False otherwise.
    """
    try:
        src_endpoint = urlsplit(url).hostname
        src_path = urlsplit(url).path
        if platform.system() == "Windows":
            # Globus expects POSIX-style paths: drop the drive colon, flip separators.
            dest_path = ''.join(('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)
        if not token:
            token, dest_endpoint = authenticate(url)
        if token is None:
            # Fix: Logger.warn is deprecated in favor of Logger.warning.
            logger.warning("A valid Globus access token is required to create transfers. "
                           "Check keychain.cfg for valid parameters.")
            return False
        if dest_endpoint is None:
            logger.warning("A valid Globus destination endpoint must be specified. "
                           "Check keychain.cfg for valid parameters.")
            return False
        # initialize transfer client
        client = globus_sdk.TransferClient(token=token)
        # Activate source endpoint (return value intentionally unused).
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        client.endpoint_autoactivate(src_endpoint, if_expires_in=600)
        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)
        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))
        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client, src_endpoint, dest_endpoint, label=label)
        tdata.add_item(src_path, dest_path, recursive=False)
        # start the transfer
        data = client.submit_transfer(tdata)
        task_id = data["task_id"]
        logger.info("Globus transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return True
    except Exception as e:
        logger.error('Globus transfer request exception: %s' % bdbag.get_named_exception(e))
        return False
def authenticate(url):
    """Return (token, local_endpoint) from the first KEYCHAIN entry matching url.

    Returns (None, None) when no entry matches or every match is invalid.
    """
    for auth in list((entry for entry in KEYCHAIN
                      if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):
        try:
            if not validate_auth_config(auth):
                continue
            return auth.auth_params.token, auth.auth_params.local_endpoint
        except Exception as e:
            # Fix: Logger.warn is deprecated in favor of Logger.warning.
            logger.warning("Unhandled exception getting Globus token: %s" %
                           bdbag.get_named_exception(e))
    return None, None
def test_validate_profile_serialization(self):
    """Serialization validation of a zipped bag against the bdbag profile succeeds."""
    logger.info(self.getTestHeader('validate profile serialization'))
    try:
        bag_path = ospj(self.test_archive_dir, 'test-bag.zip')
        profile_url = ('https://raw.githubusercontent.com/ini-bdds/bdbag/master/'
                       'profiles/bdbag-profile.json')
        bdb.validate_bag_serialization(bag_path, bag_profile_path=profile_url)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_change_file(self):
    """Updating after appending to a payload file re-manifests it."""
    logger.info(self.getTestHeader('update bag change file'))
    try:
        with open(ospj(self.test_bag_dir, 'data', 'README.txt'), 'a') as f:
            f.writelines('Additional data added via unit test.')
        updated = bdb.make_bag(self.test_bag_dir, update=True)
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['README.txt'], output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_change_metadata(self):
    """Updating with metadata arguments reads the metadata file and applies it."""
    logger.info(self.getTestHeader('update bag change metadata'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=(ospj(self.test_config_dir, 'test-metadata.json')))
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(
            ['Reading bag metadata from file', 'test-metadata.json'], output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def check_payload_consistency(bag, skip_remote=False, quiet=False):
    """Return True when payload, manifests, and remote references agree.

    Logs each discrepancy (unless quiet) and returns False otherwise.
    """
    only_in_manifests, only_on_fs, only_in_fetch = bag.compare_manifests_with_fs_and_fetch()
    payload_consistent = not only_on_fs

    if not skip_remote:
        updated_remote_files = sorted(bag.remote_entries.keys())
        existing_remote_files = sorted(list(bag.files_to_be_fetched(False)))
        modified_remote_files = list(set(updated_remote_files) - set(existing_remote_files))
        normalized_updated_remote_files = {os.path.normpath(name) for name in updated_remote_files}
        unresolved_manifest_files = list(set(only_in_manifests) - normalized_updated_remote_files)
        if modified_remote_files or only_in_fetch:
            payload_consistent = False
        if unresolved_manifest_files:
            payload_consistent = False
    elif payload_consistent:
        payload_consistent = not only_in_manifests

    for path in only_in_manifests:
        e = bagit.FileMissing(path)
        if not quiet:
            logger.warning(
                "%s. Resolve this file reference or re-run with the \"update\" flag set in order to remove this file "
                "from the manifest." % bdbag.get_named_exception(e))
    for path in only_on_fs:
        e = bagit.UnexpectedFile(path)
        if not quiet:
            logger.warning(
                "%s. Re-run with the \"update\" flag set in order to add this file to the manifest."
                % bdbag.get_named_exception(e))
    if not skip_remote:
        for path in only_in_fetch:
            e = bagit.UnexpectedRemoteFile(path)
            if not quiet:
                logger.warning(
                    "%s. Ensure that any remote file references from fetch.txt are also present in the manifest and "
                    "re-run with the \"update\" flag set in order to apply this change."
                    % bdbag.get_named_exception(e))

    return payload_consistent
def get_file(url, output_path, auth_config, headers=None, session=None):
    """HTTP GET url into output_path; returns True on success, False on failure.

    Retries once with a fresh session on a 401 (expired credentials).
    """
    try:
        if not session:
            session = get_session(url, auth_config)
        output_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if not headers:
            headers = HEADERS
        else:
            headers.update(HEADERS)
        logger.info("Attempting GET from URL: %s" % url)
        r = session.get(url, headers=headers, stream=True, verify=certifi.where())
        if r.status_code == 401:
            # Credentials may have expired: establish a fresh session and retry once.
            session = get_session(url, auth_config)
            r = session.get(url, headers=headers, stream=True, verify=certifi.where())
        if r.status_code != 200:
            logger.error('HTTP GET Failed for URL: %s' % url)
            logger.error("Host %s responded:\n\n%s" % (urlsplit(url).netloc, r.text))
            # Fix: Logger.warn is deprecated in favor of Logger.warning.
            logger.warning('File transfer failed: [%s]' % output_path)
            # Fix: previously fell through and implicitly returned None; make failure explicit.
            return False
        total = 0
        start = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        with open(output_path, 'wb') as data_file:
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                data_file.write(chunk)
                total += len(chunk)
        elapsed = datetime.datetime.now() - start
        totalSecs = elapsed.total_seconds()
        # Fix: use float division so the MB figure is not truncated under Python 2.
        totalMBs = float(total) / float(1024 * 1024)
        throughput = str("%.3f MB/second" % (totalMBs / totalSecs if totalSecs > 0 else 0.001))
        logger.info('File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. ' %
                    (output_path, totalMBs, throughput, elapsed))
        return True
    except requests.exceptions.RequestException as e:
        logger.error('HTTP Request Exception: %s' % (bdbag.get_named_exception(e)))
        return False
def test_update_bag_prune(self):
    """Pruning during an update leaves only md5 manifests and tagmanifests."""
    logger.info(self.getTestHeader('update bag prune manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir, algs=['md5'],
                               update=True, prune_manifests=True)
        self.assertIsInstance(updated, bagit.Bag)
        # Every non-md5 manifest and tagmanifest must have been removed.
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha512.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha512.txt')))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_validate_incomplete_bag_full(self):
    """Full validation of an incomplete bag raises BagValidationError, naming the missing files."""
    logger.info(self.getTestHeader('test full validation incomplete bag'))
    try:
        self.assertRaises(bagit.BagValidationError, bdb.validate_bag,
                          self.test_bag_incomplete_dir, fast=False)
        output = self.stream.getvalue()
        expected = ['bdbag-profile.json does not exist',
                    'minid_v0.1_Nov_2015.pdf does not exist']
        self.assertExpectedMessages(expected, output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_change_metadata(self):
    """Updating with metadata arguments reads the metadata file and applies it."""
    logger.info(self.getTestHeader('update bag change metadata'))
    try:
        metadata_path = ospj(self.test_config_dir, 'test-metadata.json')
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=(metadata_path))
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(
            ['Reading bag metadata from file', 'test-metadata.json'], output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def authenticate(url, auth_config):
    """Return (transfer_token, local_endpoint) from the first matching 'token' keychain entry.

    Returns (None, None) when no entry matches or every match is invalid.
    """
    for auth in list((entry for entry in auth_config
                      if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):
        try:
            if not validate_auth_config(auth):
                continue
            if auth.auth_type == 'token':
                return auth.auth_params.transfer_token, auth.auth_params.local_endpoint
        except Exception as e:
            # Fix: Logger.warn is deprecated in favor of Logger.warning.
            logger.warning("Unhandled exception getting Globus token: %s" %
                           bdbag.get_named_exception(e))
    return None, None
def read_keychain(keychain_file, create_default=True):
    """Load the keychain JSON file into namedtuple 'Auth' records.

    Falls back to DEFAULT_KEYCHAIN (attempting to create the default keychain
    file first) when the default path is requested but missing.
    """
    keychain = json.dumps(DEFAULT_KEYCHAIN)
    if keychain_file == DEFAULT_KEYCHAIN_FILE and not os.path.isfile(keychain_file) and create_default:
        logger.debug("No keychain file specified and no default keychain file found, attempting to create one.")
        try:
            create_default_keychain()
        except Exception as e:
            # Fix: message typo "provide an a different path" -> "provide a different path".
            logger.warning(
                "Unable to create default keychain file. A keychain file is required for authentication when "
                "retrieving files from protected remote resources. Either ensure that the default keychain "
                "file %s can be created or provide a different path to a valid keychain file. Error: %s" %
                (DEFAULT_KEYCHAIN_FILE, bdbag.get_named_exception(e)))
    if os.path.isfile(keychain_file):
        with open(keychain_file) as kf:
            keychain = kf.read()
    # Each JSON object becomes an immutable 'Auth' namedtuple keyed by its fields.
    return json.loads(keychain, object_hook=lambda d: collections.namedtuple('Auth', d.keys())(*d.values()))
def read_config(config_file, create_default=True):
    """Read a bdbag JSON configuration file as an OrderedDict.

    Attempts to create the default configuration file when the default path is
    requested but missing, and falls back to the built-in defaults otherwise.
    """
    config = json.dumps(bdbag.DEFAULT_CONFIG)
    if config_file == bdbag.DEFAULT_CONFIG_FILE and not os.path.isfile(config_file) and create_default:
        logger.debug("No default configuration file found, attempting to create one.")
        try:
            create_default_config()
        except Exception as e:
            logger.debug("Unable to create default configuration file %s. Using internal defaults. %s" %
                         (bdbag.DEFAULT_CONFIG_FILE, bdbag.get_named_exception(e)))
    if os.path.isfile(config_file):
        with open(config_file) as cf:
            config = cf.read()
    return json.loads(config, object_pairs_hook=OrderedDict)
def test_update_bag_change_metadata_only(self):
    """Metadata-only updates must not rewrite any payload manifests."""
    logger.info(self.getTestHeader('update bag change metadata only - do not save manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               save_manifests=False,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=(ospj(self.test_config_dir, 'test-metadata.json')))
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(
            ['Reading bag metadata from file', 'test-metadata.json'], output)
        self.assertUnexpectedMessages(
            ['updating manifest-sha1.txt',
             'updating manifest-sha256.txt',
             'updating manifest-sha512.txt',
             'updating manifest-md5.txt'], output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_create_bag_with_config(self):
    """Creating a bag with a custom config restricts checksums and sets metadata."""
    logger.info(self.getTestHeader('create bag with config'))
    try:
        config_path = ospj(self.test_config_dir, 'test-config.json')
        created = bdb.make_bag(self.test_data_dir, config_file=(config_path))
        self.assertIsInstance(created, bagit.Bag)
        # The test config selects md5 only, so no sha* manifests should exist.
        self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-sha512.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-sha512.txt')))
        baginfo = ospj(self.test_data_dir, 'bag-info.txt')
        with open(baginfo) as bi:
            baginfo_txt = bi.read()
        self.assertIn('Contact-Name: bdbag test', baginfo_txt)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def get_session(url):
    """Return a requests session for url, authenticating from the KEYCHAIN when possible.

    Returns a new, unauthenticated session when no keychain entry matches
    or authentication fails.
    """
    session = None
    response = None
    for auth in list((entry for entry in KEYCHAIN
                      if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):
        try:
            if not validate_auth_config(auth):
                continue
            if not auth.auth_uri:
                continue
            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break
            else:
                session = requests.session()
                if auth.auth_type == 'http-basic':
                    session.auth = (auth.auth_params.username, auth.auth_params.password)
                    auth_method = auth.auth_method.lower()
                    if auth_method == 'post':
                        response = session.post(auth.auth_uri, auth=session.auth)
                    elif auth_method == 'get':
                        response = session.get(auth.auth_uri, auth=session.auth)
                elif auth.auth_type == 'http-form':
                    response = session.post(auth.auth_uri,
                                            {auth.auth_params.username_field: auth.auth_params.username,
                                             auth.auth_params.password_field: auth.auth_params.password})
                if response is None:
                    # Fix: an unknown auth_type/auth_method previously left response as None,
                    # so response.status_code raised AttributeError into the handler below.
                    logger.warning("Unsupported auth_type or auth_method for keychain entry: %s" % auth.uri)
                    continue
                if response.status_code > 203:
                    # Fix: Logger.warn is deprecated in favor of Logger.warning.
                    logger.warning('Authentication failed with Status Code: %s %s\n' %
                                   (response.status_code, response.text))
                else:
                    logger.info("Session established: %s", auth.auth_uri)
                    SESSIONS[auth.auth_uri] = session
                    break
        except Exception as e:
            logger.warning("Unhandled exception during HTTP(S) authentication: %s" %
                           bdbag.get_named_exception(e))
    return session if session else requests.session()
def test_update_bag_remote(self):
    """Updating with a remote file manifest writes the expected fetch.txt entries."""
    logger.info(self.getTestHeader('update bag add remote file manifest'))
    try:
        manifest_path = ospj(self.test_config_dir, 'test-fetch-manifest.json')
        updated = bdb.make_bag(self.test_bag_dir, update=True,
                               remote_file_manifest=manifest_path)
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(
            ['Generating remote file references from', 'test-fetch-manifest.json'], output)
        fetch_file = ospj(self.test_bag_dir, 'fetch.txt')
        self.assertTrue(ospif(fetch_file))
        with open(fetch_file) as ff:
            fetch_txt = ff.read()
        self.assertIn(
            'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json'
            '\t723\tdata/bdbag-profile.json', fetch_txt)
        self.assertIn(
            'ark:/88120/r8059v\t632860\tdata/minid_v0.1_Nov_2015.pdf', fetch_txt)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_change_metadata_only(self):
    """Metadata-only updates must not rewrite any payload manifests."""
    logger.info(
        self.getTestHeader('update bag change metadata only - do not save manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               save_manifests=False,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=(ospj(self.test_config_dir, 'test-metadata.json')))
        output = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(
            ['Reading bag metadata from file', 'test-metadata.json'], output)
        unexpected = ['updating manifest-sha1.txt',
                      'updating manifest-sha256.txt',
                      'updating manifest-sha512.txt',
                      'updating manifest-md5.txt']
        self.assertUnexpectedMessages(unexpected, output)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_prune(self):
    """Pruning during an update leaves only md5 manifests and tagmanifests."""
    logger.info(self.getTestHeader('update bag prune manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True,
                               prune_manifests=True)
        self.assertIsInstance(updated, bagit.Bag)
        # Every non-md5 manifest and tagmanifest must have been removed.
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-sha512.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha1.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha256.txt')))
        self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-sha512.txt')))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def validate_bag(bag_path, fast=False, config_file=bdbag.DEFAULT_CONFIG_FILE):
    """Validate the bag at bag_path; raises on any validation failure.

    :param bag_path: path to the bag directory to validate.
    :param fast: if True, perform fast (payload-oxum only) validation.
    :param config_file: bdbag configuration file supplying bag_processes.
    """
    config = read_config(config_file)
    bag_config = config['bag_config']
    bag_processes = bag_config.get('bag_processes', 1)
    try:
        logger.info("Validating bag: %s" % bag_path)
        bag = bagit.Bag(bag_path)
        bag.validate(bag_processes, fast=fast)
        logger.info("Bag %s is valid" % bag_path)
    except bagit.BagIncompleteError as e:
        logger.warning(
            "BagIncompleteError: %s %s", e,
            "This validation error may be transient if the bag contains unresolved remote file references "
            "from a fetch.txt file. In this case the bag is incomplete but not necessarily invalid. "
            "Resolve remote file references (if any) and re-validate.")
        raise e
    except bagit.BagValidationError as e:
        # Collapse the individual per-file failures into one summary error.
        raise bagit.BagValidationError(
            '\nError: '.join(bdbag.get_named_exception(d) for d in e.details))
    except Exception as e:
        raise RuntimeError("Unhandled exception while validating bag: %s" % e)
def test_validate_complete_bag_fast(self):
    """Fast validation succeeds on a complete bag."""
    logger.info(self.getTestHeader('test fast validation complete bag'))
    try:
        bdb.validate_bag(self.test_bag_dir, fast=True)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def get_session(url, auth_config):
    """Return a requests-style session suitable for fetching *url*.

    Scans *auth_config* (presumably a list of keychain entries -- TODO confirm
    against the keychain module) for entries whose ``uri`` is a substring of
    *url*, and establishes/caches an authenticated session for the first entry
    that succeeds. Falls back to a cached or new anonymous session keyed by the
    URL's scheme://netloc when no keychain entry applies.
    """
    session = None
    response = None
    # Candidate auth entries: those with a 'uri' attribute that case-insensitively
    # matches somewhere inside the target URL.
    for auth in list(
            (entry for entry in auth_config
             if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):
        try:
            if not validate_auth_config(auth):
                continue
            # Reuse an already-authenticated session for this exact auth URI.
            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break
            else:
                session = get_new_session()
            # Cookie auth: install each "name=value" cookie on the session,
            # scoped to the auth URI's hostname, then cache and stop searching.
            if auth.auth_type == 'cookie':
                if auth.auth_params and hasattr(auth.auth_params, 'cookies'):
                    cookies = auth.auth_params.cookies
                    for cookie in cookies:
                        name, value = cookie.split('=', 1)
                        session.cookies.set(name, value, domain=urlsplit(auth.uri).hostname, path='/')
                    SESSIONS[auth.uri] = session
                    break
            # if we get here the assumption is that the auth_type is either http-basic or http-form
            if not keychain.has_auth_attr(auth, 'auth_uri'):
                logging.warning(
                    "Missing required parameter [auth_uri] for auth_type [%s] for keychain entry [%s]" %
                    (auth.auth_type, auth.uri))
                continue
            if not (keychain.has_auth_attr(auth.auth_params, 'username') and
                    keychain.has_auth_attr(auth.auth_params, 'password')):
                logging.warning(
                    "Missing required parameters [username, password] for auth_type [%s] for keychain entry [%s]" %
                    (auth.auth_type, auth.uri))
                continue
            if auth.auth_type == 'http-basic':
                session.auth = (auth.auth_params.username, auth.auth_params.password)
                # auth_method selects the HTTP verb used to prime the session;
                # defaults to POST when unspecified.
                auth_method = "post"
                if keychain.has_auth_attr(auth.auth_params, 'auth_method'):
                    auth_method = auth.auth_params.auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth.auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth.auth_uri, auth=session.auth)
                else:
                    logging.warning(
                        "Unsupported auth_method [%s] for auth_type [%s] for keychain entry [%s]" %
                        (auth_method, auth.auth_type, auth.uri))
            elif auth.auth_type == 'http-form':
                # Form login: field names are configurable, with conventional defaults.
                response = session.post(
                    auth.auth_uri,
                    {
                        auth.auth_params.username_field or "username": auth.auth_params.username,
                        auth.auth_params.password_field or "password": auth.auth_params.password
                    })
            # Anything above 203 (Non-Authoritative Information) is treated as
            # an authentication failure; on success the session is cached under
            # the auth URI and the search stops.
            if response.status_code > 203:
                logger.warning(
                    'Authentication failed with Status Code: %s %s\n' %
                    (response.status_code, response.text))
            else:
                logger.info("Session established: %s", auth.auth_uri)
                SESSIONS[auth.auth_uri] = session
                break
        except Exception as e:
            # Best-effort: a failing keychain entry should not abort the whole
            # lookup; log and try the next candidate entry.
            logger.warning(
                "Unhandled exception during HTTP(S) authentication: %s" % bdbag.get_named_exception(e))
    if not session:
        # No keychain entry matched/succeeded: fall back to an anonymous
        # session cached per scheme://netloc.
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = get_new_session()
            SESSIONS[base_url] = session
    return session
def main():
    """CLI entry point: create/update, extract, fetch, validate, archive, or
    revert a bag according to the parsed command-line arguments.

    Returns 0 on success, 1 on any error (the error text is written to stderr
    from the ``finally`` block, after temp-bag cleanup).
    """
    sys.stderr.write('\n')
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)
    archive = None
    temp_path = None
    error = None
    result = 0
    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result
        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path,
                              force=True if args.resolve_fetch == 'all' else False,
                              keychain_file=args.keychain_file)
        if args.validate:
            # archived bags are extracted to a temp dir before validation
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             fast=True if args.validate == 'fast' else False,
                             config_file=args.config_file)
        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        # if the input was already a serialized bag, use it for serialization checks
        if archive is None and is_file:
            archive = path
        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)
        if args.revert:
            bdb.revert_bag(path)
    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)
    finally:
        # always remove any temp extraction dir, then report failure (if any)
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        sys.stderr.write('\n')
    return result
def main():
    """CLI entry point (earlier variant: positional call style, no keychain or
    revert support): create/update, extract, fetch, validate, or archive a bag
    according to the parsed command-line arguments.

    Returns 0 on success, 1 on any error (the error text is written to stderr
    from the ``finally`` block, after temp-bag cleanup).
    """
    sys.stderr.write('\n')
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)
    archive = None
    temp_path = None
    error = None
    result = 0
    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result
        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, True if args.resolve_fetch == 'all' else False)
        if args.validate:
            # archived bags are extracted to a temp dir before validation
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             True if args.validate == 'fast' else False,
                             args.config_file)
        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        # if the input was already a serialized bag, use it for serialization checks
        if archive is None and is_file:
            archive = path
        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)
    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)
    finally:
        # always remove any temp extraction dir, then report failure (if any)
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        sys.stderr.write('\n')
    return result