Пример #1
0
def validate_bag(bag_path,
                 fast=False,
                 callback=None,
                 config_file=bdbag.DEFAULT_CONFIG_FILE):
    """Validate the bag located at ``bag_path``.

    The process count handed to ``bagit.Bag.validate`` is read from the
    ``bag_processes`` entry of the ``bag_config`` section of the configuration
    returned by ``read_config(config_file)`` (1 when the entry is absent).

    :param bag_path: path passed to ``bagit.Bag``.
    :param fast: forwarded to ``bagit.Bag.validate``.
    :param callback: forwarded to ``bagit.Bag.validate``; when it is set the
        validation is run with a process count of 1.
    :param config_file: configuration file passed to ``read_config``.
    :raises bagit.BagIncompleteError: re-raised after logging a hint about
        unresolved remote file references.
    :raises bagit.BagValidationError: raised anew with a message built from
        the named form of every entry in the original error's ``details``.
    :raises bagit.InterruptedError: re-raised after being logged.
    :raises RuntimeError: raised for any other exception.
    """
    config = read_config(config_file)
    bag_processes = config['bag_config'].get('bag_processes', 1)

    try:
        logger.info("Validating bag: %s" % bag_path)
        bag = bagit.Bag(bag_path)
        # A callback forces a process count of 1.
        bag.validate(bag_processes if not callback else 1,
                     fast=fast,
                     callback=callback)
        logger.info("Bag %s is valid" % bag_path)
    except bagit.BagIncompleteError as e:
        logger.warning(
            "BagIncompleteError: %s %s", e,
            "This validation error may be transient if the bag contains unresolved remote file references "
            "from a fetch.txt file. In this case the bag is incomplete but not necessarily invalid. "
            "Resolve remote file references (if any) and re-validate.")
        # Bare raise keeps the original traceback intact.
        raise
    except bagit.BagValidationError as e:
        errors = [bdbag.get_named_exception(d) for d in e.details]
        raise bagit.BagValidationError('\nError: '.join(errors))
    except bagit.InterruptedError as e:
        logger.warning(bdbag.get_named_exception(e))
        raise
    except Exception as e:
        raise RuntimeError("Unhandled exception while validating bag: %s" % e)
Пример #2
0
def check_payload_consistency(bag, skip_remote=False, quiet=False):
    """Check that the manifests, payload files and fetch entries of ``bag`` agree.

    The three difference collections come from
    ``bag.compare_manifests_with_fs_and_fetch()``. The payload is reported as
    inconsistent when any of the following holds:

    * files exist on the filesystem that are not in a manifest;
    * with ``skip_remote`` set: files are listed in a manifest only;
    * without ``skip_remote``: ``bag.remote_entries`` has keys that are not in
      ``bag.files_to_be_fetched(False)``, entries exist only in fetch,
      manifest-only files are not covered by ``bag.remote_entries``,
      ``bag.files_to_be_fetched()`` contains files missing from
      ``bag.payload_files()``, or a fetched file exists locally with a size
      different from the one declared in its fetch entry.

    :param bag: bag object providing the methods and attributes used above.
    :param skip_remote: ignore remote (fetch) entries when True.
    :param quiet: when True no warning is logged for the detected differences.
    :return: True when the payload is consistent, False otherwise.
    :raises ValueError: when a fetch entry for an existing file declares a
        size that is neither an integer nor ``-``.
    """
    only_in_manifests, only_on_fs, only_in_fetch = bag.compare_manifests_with_fs_and_fetch()
    payload_consistent = not only_on_fs
    if not skip_remote:
        updated_remote_files = set(bag.remote_entries.keys())
        existing_remote_files = set(bag.files_to_be_fetched(False))
        unresolved_fetch_files = set(bag.files_to_be_fetched()) - set(bag.payload_files())
        modified_remote_files = updated_remote_files - existing_remote_files
        normalized_updated_remote_files = {
            os.path.normpath(filename) for filename in updated_remote_files}
        unresolved_manifest_files = set(only_in_manifests) - normalized_updated_remote_files
        if (modified_remote_files or only_in_fetch
                or unresolved_manifest_files or unresolved_fetch_files):
            payload_consistent = False
        for url, size, path in bag.fetch_entries():
            # A length of "-" in fetch.txt means "unknown", so there is nothing
            # to compare against; previously int("-") raised ValueError here.
            if str(size).strip() == '-':
                continue
            output_path = os.path.normpath(os.path.join(bag.path, path))
            if os.path.exists(output_path) and os.path.getsize(output_path) != int(size):
                payload_consistent = False
    elif payload_consistent:
        payload_consistent = not only_in_manifests

    if not quiet:
        for path in only_in_manifests:
            e = bagit.FileMissing(path)
            logger.warning(
                "%s. Resolve this file reference by either 1) adding the missing file to the bag payload or 2) adding "
                "a remote file reference in fetch.txt. or 3) re-run with the \"update\" flag set in order to remove "
                "this file from the bag manifest." %
                bdbag.get_named_exception(e))
        for path in only_on_fs:
            e = bagit.UnexpectedFile(path)
            logger.warning(
                "%s. Re-run with the \"update\" flag set in order to add this file to the manifest."
                % bdbag.get_named_exception(e))
        if not skip_remote:
            for path in only_in_fetch:
                e = bagit.UnexpectedRemoteFile(path)
                logger.warning(
                    "%s. Ensure that any remote file references from fetch.txt are also present in the manifest and "
                    "re-run with the \"update\" flag set in order to apply this change."
                    % bdbag.get_named_exception(e))

    return payload_consistent
Пример #3
0
 def test_revert_bag(self):
     # Revert the test bag, then check through the ospif/ospe helpers that the
     # bag tag files and manifests are gone, that README.txt and the test1/test2
     # files are found directly under the bag directory, and that 'data' is gone.
     logger.info(self.getTestHeader('revert bag'))
     try:
         bdb.revert_bag(self.test_bag_dir)
         # Each file is checked once (manifest-sha1.txt used to be asserted twice).
         removed_files = (
             'bag-info.txt',
             'bagit.txt',
             'manifest-sha1.txt',
             'manifest-md5.txt',
             'manifest-sha256.txt',
             'manifest-sha512.txt',
             'tagmanifest-md5.txt',
             'tagmanifest-sha1.txt',
             'tagmanifest-sha256.txt',
             'tagmanifest-sha512.txt',
         )
         for name in removed_files:
             self.assertFalse(ospif(ospj(self.test_bag_dir, name)))
         self.assertTrue(ospif(ospj(self.test_bag_dir, 'README.txt')))
         self.assertTrue(
             ospif(ospj(self.test_bag_dir, ospj('test1', 'test1.txt'))))
         self.assertTrue(
             ospif(ospj(self.test_bag_dir, ospj('test2', 'test2.txt'))))
         self.assertFalse(ospe(ospj(self.test_bag_dir, 'data')))
     except Exception as e:
         self.fail(bdbag.get_named_exception(e))
Пример #4
0
def get_file(url, output_path, auth_config, credentials=None):
    """Retrieve ``url`` over FTP and store it at ``output_path``.

    Credentials are looked up with ``get_credentials(url, auth_config)`` when
    none are supplied, and are embedded into the URL handed to ``urlretrieve``.
    The directory of ``output_path`` is created when it does not exist.

    :param url: URL of the file to retrieve.
    :param output_path: local path the file is written to.
    :param auth_config: passed to ``get_credentials``.
    :param credentials: indexable pair (user, password); falsy members fall
        back to the built-in defaults.
    :return: True when the transfer completed, False when any exception was
        raised (the exception is logged, not propagated).
    """
    try:
        if not credentials:
            credentials = get_credentials(url, auth_config)

        target_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        logger.info("Attempting FTP retrieve from URL: %s" % url)
        user = credentials[0] or "anonymous"
        password = credentials[1] or "*****@*****.**"
        creds = "%s:%s@" % (user, password)

        # Rebuild the URL with the credentials prepended to the network location.
        parts = urlsplit(url)
        netloc_with_creds = "%s%s" % (creds, parts.netloc)
        full_url = urlunsplit((parts.scheme, netloc_with_creds, parts.path,
                               parts.query, parts.fragment))

        started_at = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        urlretrieve(full_url, output_path)
        elapsed = datetime.datetime.now() - started_at

        byte_count = os.path.getsize(output_path)
        seconds = elapsed.total_seconds()
        megabytes = float(byte_count) / float((1024 * 1024))
        if seconds > 0:
            rate = megabytes / seconds
        else:
            rate = 0.001
        throughput = str("%.3f MB/second" % rate)
        logger.info(
            'File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. '
            % (output_path, megabytes, throughput, elapsed))
        return True

    except Exception as exc:
        logger.error('FTP Request Exception: %s' %
                     (bdbag.get_named_exception(exc)))
        logger.warning('File transfer failed: [%s]' % output_path)
        return False
Пример #5
0
 def test_archive_bag_tar(self):
     # Archive the test bag in 'tar' format; the value returned by
     # archive_bag must pass the ospif check.
     logger.info(self.getTestHeader('archive bag tar format'))
     try:
         tar_path = bdb.archive_bag(self.test_bag_dir, 'tar')
         archive_found = ospif(tar_path)
         self.assertTrue(archive_found)
     except Exception as exc:
         # Any error, including a failed assertion, is reported by its named form.
         self.fail(bdbag.get_named_exception(exc))
Пример #6
0
 def test_create_bag_with_config(self):
     # Create a bag from the test data directory using test-config.json, then
     # check that no sha1/sha256/sha512 (tag)manifest passes the ospif check
     # and that bag-info.txt carries the expected Contact-Name line.
     logger.info(self.getTestHeader('create bag with config'))
     try:
         config_path = ospj(self.test_config_dir, 'test-config.json')
         created = bdb.make_bag(self.test_data_dir, config_file=config_path)
         self.assertIsInstance(created, bagit.Bag)
         unexpected_files = (
             'manifest-sha1.txt',
             'manifest-sha256.txt',
             'manifest-sha512.txt',
             'tagmanifest-sha1.txt',
             'tagmanifest-sha256.txt',
             'tagmanifest-sha512.txt',
         )
         for name in unexpected_files:
             self.assertFalse(ospif(ospj(self.test_data_dir, name)))
         with open(ospj(self.test_data_dir, 'bag-info.txt')) as info_file:
             info_text = info_file.read()
         self.assertIn('Contact-Name: bdbag test', info_text)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #7
0
 def test_archive_bag_tar(self):
     # Archive the test bag in 'tar' format; the value returned by
     # archive_bag must pass the ospif check.
     logger.info(self.getTestHeader('archive bag tar format'))
     try:
         tar_path = bdb.archive_bag(self.test_bag_dir, 'tar')
         archive_found = ospif(tar_path)
         self.assertTrue(archive_found)
     except Exception as exc:
         # Any error, including a failed assertion, is reported by its named form.
         self.fail(bdbag.get_named_exception(exc))
Пример #8
0
def get_session(url, auth_config):
    """Return a session to use for ``url``, authenticating first when possible.

    Every entry of ``auth_config`` that has a ``uri`` attribute contained
    (case-insensitively) in ``url`` is tried in order. Entries that fail
    ``validate_auth_config`` or have no ``auth_uri`` are skipped. A session
    already cached in ``SESSIONS`` under the entry's ``uri`` is reused;
    otherwise a new session is created and an ``http-basic`` (POST or GET) or
    ``http-form`` (POST) request is sent to ``auth_uri``. A response status of
    203 or lower counts as success and the session is cached under the
    entry's ``uri``.

    When no session was selected by the loop, a session cached under the
    ``scheme://netloc`` of ``url`` is returned, creating and caching one if
    needed.

    :param url: URL the session is wanted for.
    :param auth_config: iterable of authentication entries.
    :return: the selected session object. A session whose authentication
        attempt failed is still returned, but is not cached.
    """
    session = None

    matching_entries = [
        entry for entry in auth_config
        if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower())]

    for auth in matching_entries:

        try:
            if not validate_auth_config(auth):
                continue

            if not auth.auth_uri:
                continue

            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break
            else:
                session = get_new_session()

            # Reset per entry so a response from an earlier entry is never
            # mistaken for the outcome of this one.
            response = None
            if auth.auth_type == 'http-basic':
                session.auth = (auth.auth_params.username,
                                auth.auth_params.password)
                auth_method = auth.auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth.auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth.auth_uri, auth=session.auth)
            elif auth.auth_type == 'http-form':
                response = session.post(
                    auth.auth_uri, {
                        auth.auth_params.username_field:
                        auth.auth_params.username,
                        auth.auth_params.password_field:
                        auth.auth_params.password
                    })

            if response is None:
                # Neither a supported auth_type nor a supported auth_method:
                # no request was sent, so there is nothing to evaluate.
                logger.warning(
                    'Unsupported authentication type or method for: %s' %
                    auth.auth_uri)
                continue

            if response.status_code > 203:
                logger.warning('Authentication failed with Status Code: %s %s\n' %
                               (response.status_code, response.text))
            else:
                logger.info("Session established: %s", auth.auth_uri)
                # Cache under the same key the lookup above uses (auth.uri);
                # caching under auth.auth_uri made the lookup miss.
                SESSIONS[auth.uri] = session
                break

        except Exception as e:
            logger.warning(
                "Unhandled exception during HTTP(S) authentication: %s" %
                bdbag.get_named_exception(e))

    if not session:
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = get_new_session()
            SESSIONS[base_url] = session

    return session
Пример #9
0
 def test_create_bag(self):
     # make_bag on the test data directory must return a bagit.Bag.
     logger.info(self.getTestHeader('create bag'))
     try:
         created = bdb.make_bag(self.test_data_dir)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
     else:
         try:
             self.assertIsInstance(created, bagit.Bag)
         except Exception as exc:
             # A failed assertion is also reported by its named form.
             self.fail(bdbag.get_named_exception(exc))
Пример #10
0
 def test_update_bag_remote(self):
     # Update the test bag with test-fetch-manifest.json as remote file
     # manifest, then check the captured log output and the two expected
     # entries in the generated fetch.txt.
     logger.info(self.getTestHeader('update bag add remote file manifest'))
     try:
         manifest_path = ospj(self.test_config_dir,
                              'test-fetch-manifest.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                remote_file_manifest=manifest_path)
         log_text = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         expected_messages = [
             'Generating remote file references from',
             'test-fetch-manifest.json',
         ]
         self.assertExpectedMessages(expected_messages, log_text)

         fetch_path = ospj(self.test_bag_dir, 'fetch.txt')
         self.assertTrue(ospif(fetch_path))
         with open(fetch_path) as fetch_handle:
             fetch_text = fetch_handle.read()
         expected_entries = (
             'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json'
             '\t723\tdata/bdbag-profile.json',
             'ark:/88120/r8059v\t632860\tdata/minid_v0.1_Nov_2015.pdf',
         )
         for entry in expected_entries:
             self.assertIn(entry, fetch_text)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #11
0
def resolve(ark):
    """Resolve ``ark`` into a list of location URLs.

    Issues a GET to ``RESOLVER_URL + '/' + ark`` asking for JSON. On a 200
    response the body is parsed and the ``uri`` of every entry under
    ``locations`` is collected. A non-200 response, an unparsable body, or a
    body without that structure is logged and yields an empty list.

    :param ark: identifier to resolve, or None.
    :return: None when ``ark`` is None, otherwise the (possibly empty) list of
        URIs found.
    """
    if ark is None:
        return None

    urls = []
    resolver_url = ''.join((RESOLVER_URL, '/', ark))
    logger.info("Attempting to resolve %s into a valid set of URLs." % ark)
    r = requests.get(resolver_url, headers={'accept': 'application/json', 'Connection': 'close'})
    if r.status_code != 200:
        logger.error('HTTP GET Failed for: %s' % r.url)
        logger.error("Host %s responded:\n\n%s" % (urlsplit(r.url).netloc, r.text))
    else:
        info = {}
        try:
            info = json.loads(r.text, object_pairs_hook=OrderedDict)
        except Exception as e:
            logger.warning("Unable to parse ARK resolution result, a MINID or other supported JSON metadata structure "
                           "was not found. Exception: %s" % bdbag.get_named_exception(e))
        # need a better way to validate minid response structure
        # Valid JSON is not necessarily an object: tolerate a non-dict body, a
        # null "locations" value and non-dict location entries instead of
        # failing with AttributeError/TypeError.
        locations = info.get('locations') if isinstance(info, dict) else None
        for location in locations or []:
            uri = location.get('uri', None) if isinstance(location, dict) else None
            if uri:
                urls.append(uri)

    if urls:
        logger.info("The identifier %s resolved into the following locations: %s" % (ark, urls))
    else:
        logger.warning("No file locations were found for identifier %s" % ark)

    return urls
Пример #12
0
 def test_validate_profile(self):
     # validate_bag_profile on the test bag must return a bagit_profile.Profile.
     logger.info(self.getTestHeader('validate profile'))
     try:
         validated = bdb.validate_bag_profile(self.test_bag_dir)
         expected_type = bagit_profile.Profile
         self.assertIsInstance(validated, expected_type)
     except Exception as exc:
         # Any error, including a failed assertion, is reported by its named form.
         self.fail(bdbag.get_named_exception(exc))
Пример #13
0
 def test_create_bag(self):
     # make_bag on the test data directory must return a bagit.Bag.
     logger.info(self.getTestHeader('create bag'))
     try:
         created = bdb.make_bag(self.test_data_dir)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
     else:
         try:
             self.assertIsInstance(created, bagit.Bag)
         except Exception as exc:
             # A failed assertion is also reported by its named form.
             self.fail(bdbag.get_named_exception(exc))
Пример #14
0
def get_file(url, output_path, auth_config, token=None, dest_endpoint=None):
    """Submit a Globus transfer of ``url`` to ``output_path``.

    The source endpoint and source path are the hostname and path of ``url``.
    When no ``token`` is supplied, token and destination endpoint are obtained
    from ``authenticate(url, auth_config)``. Both endpoints are auto-activated
    before a single, non-recursive transfer item is submitted. The function
    returns once the transfer has been submitted; nothing in this function
    waits for it to finish.

    :param url: URL identifying the source endpoint and path.
    :param output_path: local destination path.
    :param auth_config: passed to ``authenticate``.
    :param token: Globus access token; looked up when falsy.
    :param dest_endpoint: destination endpoint; replaced by the looked-up
        value when ``token`` is falsy.
    :return: True when the transfer was submitted; False when token or
        destination endpoint is missing or any exception was raised (the
        exception is logged, not propagated).
    """
    try:
        url_parts = urlsplit(url)
        src_endpoint = url_parts.hostname
        src_path = url_parts.path
        if platform.system() == "Windows":
            # Rewrite e.g. C:\dir\file as /C/dir/file.
            dest_path = ''.join(
                ('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)

        if not token:
            token, dest_endpoint = authenticate(url, auth_config)
        if token is None:
            logger.warning(
                "A valid Globus access token is required to create transfers. "
                "Check keychain.cfg for valid parameters.")
            return False

        if dest_endpoint is None:
            logger.warning(
                "A valid Globus destination endpoint must be specified. "
                "Check keychain.cfg for valid parameters.")
            return False

        # initialize transfer client
        client = globus_sdk.TransferClient(token=token)

        # Activate source endpoint
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        client.endpoint_autoactivate(src_endpoint, if_expires_in=600)

        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)

        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))

        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client,
                                        src_endpoint,
                                        dest_endpoint,
                                        label=label)

        tdata.add_item(src_path, dest_path, recursive=False)

        # start the transfer
        submission = client.submit_transfer(tdata)
        task_id = submission["task_id"]

        logger.info("Globus transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return True

    except Exception as e:
        logger.error('Globus transfer request exception: %s' %
                     bdbag.get_named_exception(e))

    return False
Пример #15
0
 def test_validate_profile(self):
     # validate_bag_profile on the test bag must return a bagit_profile.Profile.
     logger.info(self.getTestHeader('validate profile'))
     try:
         validated = bdb.validate_bag_profile(self.test_bag_dir)
         expected_type = bagit_profile.Profile
         self.assertIsInstance(validated, expected_type)
     except Exception as exc:
         # Any error, including a failed assertion, is reported by its named form.
         self.fail(bdbag.get_named_exception(exc))
Пример #16
0
 def test_resolve_fetch_ark(self):
     # Resolve the fetch entries of the ARK test bag and run a full
     # (fast=False) validation; the test passes when neither call raises.
     logger.info(self.getTestHeader('test resolve fetch ark'))
     try:
         ark_bag_dir = self.test_bag_fetch_ark_dir
         bdb.resolve_fetch(ark_bag_dir)
         bdb.validate_bag(self.test_bag_fetch_ark_dir, fast=False)
         # NOTE(review): the captured log output is read but never inspected.
         output = self.stream.getvalue()
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #17
0
 def test_resolve_fetch_ark(self):
     # Resolve the fetch entries of the ARK test bag and run a full
     # (fast=False) validation; the test passes when neither call raises.
     logger.info(self.getTestHeader('test resolve fetch ark'))
     try:
         ark_bag_dir = self.test_bag_fetch_ark_dir
         bdb.resolve_fetch(ark_bag_dir)
         bdb.validate_bag(self.test_bag_fetch_ark_dir, fast=False)
         # NOTE(review): the captured log output is read but never inspected.
         output = self.stream.getvalue()
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #18
0
 def test_validate_profile_serialization(self):
     # Validate the serialization of test-bag.zip against the bdbag profile
     # URL; the test passes when the call does not raise.
     logger.info(self.getTestHeader('validate profile serialization'))
     try:
         archive_path = ospj(self.test_archive_dir, 'test-bag.zip')
         profile_url = (
             'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json')
         bdb.validate_bag_serialization(archive_path,
                                        bag_profile_path=profile_url)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #19
0
 def test_validate_incomplete_bag_full(self):
     # Full (fast=False) validation of the incomplete test bag must raise
     # bagit.BagValidationError, and the captured log output must mention
     # both missing files.
     logger.info(self.getTestHeader('test full validation incomplete bag'))
     try:
         with self.assertRaises(bagit.BagValidationError):
             bdb.validate_bag(self.test_bag_incomplete_dir, fast=False)
         captured = self.stream.getvalue()
         expected_messages = [
             'bdbag-profile.json does not exist',
             'minid_v0.1_Nov_2015.pdf does not exist',
         ]
         self.assertExpectedMessages(expected_messages, captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #20
0
 def test_extract_bag_archive_tar(self):
     # Extract test-bag.tar with temp=True, check that the returned path
     # passes the ospe check and is recognised as a bag, then clean up the
     # parent directory of the extracted bag.
     logger.info(self.getTestHeader('extract bag tar format'))
     try:
         tar_path = ospj(self.test_archive_dir, 'test-bag.tar')
         extracted_path = bdb.extract_bag(tar_path, temp=True)
         self.assertTrue(ospe(extracted_path))
         self.assertTrue(bdb.is_bag(extracted_path))
         extraction_root = os.path.dirname(extracted_path)
         bdb.cleanup_bag(extraction_root)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #21
0
 def test_validate_incomplete_bag_fast(self):
     # Fast validation of the incomplete test bag must raise
     # bagit.BagIncompleteError.
     logger.info(self.getTestHeader('test fast validation incomplete bag'))
     try:
         with self.assertRaises(bagit.BagIncompleteError):
             bdb.validate_bag(self.test_bag_incomplete_dir, fast=True)
     except Exception as exc:
         # A missing or different exception surfaces here as a test failure.
         self.fail(bdbag.get_named_exception(exc))
Пример #22
0
 def test_update_bag_remove_file(self):
     # Delete data/test1/test1.txt from the test bag, update the bag, and
     # check that the result is a bagit.Bag and that 'test1.txt' is treated
     # as an unexpected message for the captured log output.
     logger.info(self.getTestHeader('update bag remove file'))
     try:
         removed_path = ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt')
         os.remove(removed_path)
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         log_text = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         unexpected_messages = ['test1.txt']
         self.assertUnexpectedMessages(unexpected_messages, log_text)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #23
0
 def test_update_bag_remove_file(self):
     # Delete data/test1/test1.txt from the test bag, update the bag, and
     # check that the result is a bagit.Bag and that 'test1.txt' is treated
     # as an unexpected message for the captured log output.
     logger.info(self.getTestHeader('update bag remove file'))
     try:
         removed_path = ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt')
         os.remove(removed_path)
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         log_text = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         unexpected_messages = ['test1.txt']
         self.assertUnexpectedMessages(unexpected_messages, log_text)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #24
0
def get_file(url, output_path, token=None, dest_endpoint=None):
    """Submit a Globus transfer of ``url`` to ``output_path``.

    The source endpoint and source path are the hostname and path of ``url``.
    When no ``token`` is supplied, token and destination endpoint are obtained
    from ``authenticate(url)``. Both endpoints are auto-activated before a
    single, non-recursive transfer item is submitted. The function returns
    once the transfer has been submitted; nothing in this function waits for
    it to finish.

    :param url: URL identifying the source endpoint and path.
    :param output_path: local destination path.
    :param token: Globus access token; looked up when falsy.
    :param dest_endpoint: destination endpoint; replaced by the looked-up
        value when ``token`` is falsy.
    :return: True when the transfer was submitted; False when token or
        destination endpoint is missing or any exception was raised (the
        exception is logged, not propagated).
    """
    try:
        url_parts = urlsplit(url)
        src_endpoint = url_parts.hostname
        src_path = url_parts.path
        if platform.system() == "Windows":
            # Rewrite e.g. C:\dir\file as /C/dir/file.
            dest_path = ''.join(('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)

        if not token:
            token, dest_endpoint = authenticate(url)
        if token is None:
            logger.warning("A valid Globus access token is required to create transfers. "
                           "Check keychain.cfg for valid parameters.")
            return False

        if dest_endpoint is None:
            logger.warning("A valid Globus destination endpoint must be specified. "
                           "Check keychain.cfg for valid parameters.")
            return False

        # initialize transfer client
        client = globus_sdk.TransferClient(token=token)

        # Activate source endpoint
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        client.endpoint_autoactivate(src_endpoint, if_expires_in=600)

        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)

        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))

        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client,
                                        src_endpoint,
                                        dest_endpoint,
                                        label=label)

        tdata.add_item(src_path, dest_path, recursive=False)

        # start the transfer
        submission = client.submit_transfer(tdata)
        task_id = submission["task_id"]

        logger.info("Globus transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return True

    except Exception as e:
        logger.error('Globus transfer request exception: %s' % bdbag.get_named_exception(e))

    return False
Пример #25
0
 def test_extract_bag_archive_tar(self):
     # Extract test-bag.tar with temp=True, check that the returned path
     # passes the ospe check and is recognised as a bag, then clean up the
     # parent directory of the extracted bag.
     logger.info(self.getTestHeader('extract bag tar format'))
     try:
         tar_path = ospj(self.test_archive_dir, 'test-bag.tar')
         extracted_path = bdb.extract_bag(tar_path, temp=True)
         self.assertTrue(ospe(extracted_path))
         self.assertTrue(bdb.is_bag(extracted_path))
         extraction_root = os.path.dirname(extracted_path)
         bdb.cleanup_bag(extraction_root)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #26
0
def authenticate(url):
    """Look up the Globus token and local endpoint for ``url`` in ``KEYCHAIN``.

    Entries are considered when they have a ``uri`` attribute that is
    contained (case-insensitively) in ``url``. The first such entry that
    passes ``validate_auth_config`` supplies the result.

    :param url: URL to find credentials for.
    :return: tuple ``(token, local_endpoint)`` from the entry's
        ``auth_params``, or ``(None, None)`` when no entry qualifies.
    """
    matching_entries = [
        entry for entry in KEYCHAIN
        if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower())]

    for auth in matching_entries:
        try:
            if not validate_auth_config(auth):
                continue
            return auth.auth_params.token, auth.auth_params.local_endpoint
        except Exception as e:
            # Best effort: a broken entry is logged and the next one is tried.
            logger.warning("Unhandled exception getting Globus token: %s" % bdbag.get_named_exception(e))

    return None, None
Пример #27
0
 def test_validate_profile_serialization(self):
     # Validate the serialization of test-bag.zip against the bdbag profile
     # URL; the test passes when the call does not raise.
     logger.info(self.getTestHeader('validate profile serialization'))
     try:
         archive_path = ospj(self.test_archive_dir, 'test-bag.zip')
         profile_url = (
             'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json')
         bdb.validate_bag_serialization(archive_path,
                                        bag_profile_path=profile_url)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #28
0
 def test_update_bag_change_file(self):
     # Append text to data/README.txt in the test bag, update the bag, and
     # check that the result is a bagit.Bag and that 'README.txt' is passed
     # as an expected message for the captured log output.
     logger.info(self.getTestHeader('update bag change file'))
     try:
         with open(ospj(self.test_bag_dir, 'data', 'README.txt'), 'a') as f:
             # write(), not writelines(): the payload is one string, and
             # writelines() would iterate over it character by character.
             f.write('Additional data added via unit test.')
         bag = bdb.make_bag(self.test_bag_dir, update=True)
         output = self.stream.getvalue()
         self.assertIsInstance(bag, bagit.Bag)
         self.assertExpectedMessages(['README.txt'], output)
     except Exception as e:
         self.fail(bdbag.get_named_exception(e))
Пример #29
0
 def test_update_bag_change_file(self):
     # Append text to data/README.txt in the test bag, update the bag, and
     # check that the result is a bagit.Bag and that 'README.txt' is passed
     # as an expected message for the captured log output.
     logger.info(self.getTestHeader('update bag change file'))
     try:
         with open(ospj(self.test_bag_dir, 'data', 'README.txt'), 'a') as f:
             # write(), not writelines(): the payload is one string, and
             # writelines() would iterate over it character by character.
             f.write('Additional data added via unit test.')
         bag = bdb.make_bag(self.test_bag_dir, update=True)
         output = self.stream.getvalue()
         self.assertIsInstance(bag, bagit.Bag)
         self.assertExpectedMessages(['README.txt'], output)
     except Exception as e:
         self.fail(bdbag.get_named_exception(e))
Пример #30
0
 def test_update_bag_change_metadata(self):
     # Update the test bag with both an inline metadata dict and
     # test-metadata.json, then check the result type and the captured log
     # output.
     logger.info(self.getTestHeader('update bag change metadata'))
     try:
         inline_metadata = {"Contact-Name": "nobody"}
         metadata_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                metadata=inline_metadata,
                                metadata_file=metadata_path)
         log_text = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         expected_messages = ['Reading bag metadata from file', 'test-metadata.json']
         self.assertExpectedMessages(expected_messages, log_text)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #31
0
def check_payload_consistency(bag, skip_remote=False, quiet=False):
    """Check that the manifests, payload files and fetch entries of ``bag`` agree.

    The three difference collections come from
    ``bag.compare_manifests_with_fs_and_fetch()``. The payload is reported as
    inconsistent when files exist on the filesystem only; when, with
    ``skip_remote`` set, files are listed in a manifest only; or when, without
    ``skip_remote``, ``bag.remote_entries`` has keys missing from
    ``bag.files_to_be_fetched(False)``, entries exist only in fetch, or
    manifest-only files are not covered by ``bag.remote_entries``.

    :param bag: bag object providing the methods and attributes used above.
    :param skip_remote: ignore remote (fetch) entries when True.
    :param quiet: when True no warning is logged for the detected differences.
    :return: True when the payload is consistent, False otherwise.
    """
    only_in_manifests, only_on_fs, only_in_fetch = bag.compare_manifests_with_fs_and_fetch()
    consistent = not only_on_fs

    if skip_remote:
        if consistent:
            consistent = not only_in_manifests
    else:
        updated = sorted(bag.remote_entries.keys())
        existing = sorted(list(bag.files_to_be_fetched(False)))
        modified = list(set(updated) - set(existing))
        # Manifest paths are compared against normalized remote entry names.
        normalized = set(os.path.normpath(name) for name in updated)
        unresolved = list(set(only_in_manifests) - normalized)
        if modified or only_in_fetch or unresolved:
            consistent = False

    for missing in only_in_manifests:
        error = bagit.FileMissing(missing)
        if quiet:
            continue
        logger.warning(
            "%s. Resolve this file reference or re-run with the \"update\" flag set in order to remove this file "
            "from the manifest." % bdbag.get_named_exception(error))

    for extra in only_on_fs:
        error = bagit.UnexpectedFile(extra)
        if quiet:
            continue
        logger.warning(
            "%s. Re-run with the \"update\" flag set in order to add this file to the manifest."
            % bdbag.get_named_exception(error))

    if not skip_remote:
        for remote in only_in_fetch:
            error = bagit.UnexpectedRemoteFile(remote)
            if quiet:
                continue
            logger.warning(
                "%s. Ensure that any remote file references from fetch.txt are also present in the manifest and "
                "re-run with the \"update\" flag set in order to apply this change." % bdbag.get_named_exception(error))

    return consistent
Пример #32
0
def get_file(url, output_path, auth_config, headers=None, session=None):
    """Download ``url`` to ``output_path`` with a streamed HTTP GET.

    A session is obtained from ``get_session(url, auth_config)`` when none is
    supplied. The directory of ``output_path`` is created when it does not
    exist. A 401 response triggers one retry with a session obtained again
    from ``get_session``. Only a 200 response is written to disk, in chunks of
    ``CHUNK_SIZE``.

    :param url: URL to download.
    :param output_path: local path the body is written to.
    :param auth_config: passed to ``get_session``.
    :param headers: optional extra request headers; entries of ``HEADERS``
        take precedence. The caller's mapping is not modified.
    :param session: optional session to use for the request.
    :return: True when the file was written; False on a non-200 response or a
        ``requests`` exception (both are logged). Other exceptions propagate.
    """
    try:
        if not session:
            session = get_session(url, auth_config)
        output_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Merge into a fresh dict so the caller's ``headers`` is never mutated;
        # as before, HEADERS entries win over caller-supplied ones.
        request_headers = dict(headers) if headers else {}
        request_headers.update(HEADERS)
        logger.info("Attempting GET from URL: %s" % url)
        r = session.get(url,
                        headers=request_headers,
                        stream=True,
                        verify=certifi.where())
        if r.status_code == 401:
            session = get_session(url, auth_config)
            r = session.get(url,
                            headers=request_headers,
                            stream=True,
                            verify=certifi.where())
        if r.status_code != 200:
            logger.error('HTTP GET Failed for URL: %s' % url)
            logger.error("Host %s responded:\n\n%s" %
                         (urlsplit(url).netloc, r.text))
            logger.warning('File transfer failed: [%s]' % output_path)
        else:
            total = 0
            start = datetime.datetime.now()
            logger.debug("Transferring file %s to %s" % (url, output_path))
            with open(output_path, 'wb') as data_file:
                for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                    data_file.write(chunk)
                    total += len(chunk)
            elapsed = datetime.datetime.now() - start
            totalSecs = elapsed.total_seconds()
            # Explicit float division: integer division would report 0 MB for
            # anything below 1 MiB on Python 2.
            totalMBs = float(total) / float((1024 * 1024))
            throughput = str(
                "%.3f MB/second" %
                (totalMBs / totalSecs if totalSecs > 0 else 0.001))
            logger.info(
                'File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. '
                % (output_path, totalMBs, throughput, elapsed))
            return True

    except requests.exceptions.RequestException as e:
        logger.error('HTTP Request Exception: %s' %
                     (bdbag.get_named_exception(e)))

    return False
Пример #33
0
 def test_update_bag_prune(self):
     # Update the test bag with only the md5 algorithm and manifest pruning
     # enabled; no sha1/sha256/sha512 (tag)manifest may pass the ospif check
     # afterwards.
     logger.info(self.getTestHeader('update bag prune manifests'))
     try:
         updated = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True, prune_manifests=True)
         self.assertIsInstance(updated, bagit.Bag)
         pruned_files = (
             'manifest-sha1.txt',
             'manifest-sha256.txt',
             'manifest-sha512.txt',
             'tagmanifest-sha1.txt',
             'tagmanifest-sha256.txt',
             'tagmanifest-sha512.txt',
         )
         for name in pruned_files:
             self.assertFalse(ospif(ospj(self.test_bag_dir, name)))
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #34
0
 def test_validate_incomplete_bag_full(self):
     # Full (fast=False) validation of the incomplete test bag must raise
     # bagit.BagValidationError, and the captured log output must mention
     # both missing files.
     logger.info(self.getTestHeader('test full validation incomplete bag'))
     try:
         with self.assertRaises(bagit.BagValidationError):
             bdb.validate_bag(self.test_bag_incomplete_dir, fast=False)
         captured = self.stream.getvalue()
         expected_messages = [
             'bdbag-profile.json does not exist',
             'minid_v0.1_Nov_2015.pdf does not exist',
         ]
         self.assertExpectedMessages(expected_messages, captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #35
0
 def test_update_bag_change_metadata(self):
     # Update the test bag with both an inline metadata dict and
     # test-metadata.json, then check the result type and the captured log
     # output.
     logger.info(self.getTestHeader('update bag change metadata'))
     try:
         inline_metadata = {"Contact-Name": "nobody"}
         metadata_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                metadata=inline_metadata,
                                metadata_file=metadata_path)
         log_text = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         expected_messages = ['Reading bag metadata from file', 'test-metadata.json']
         self.assertExpectedMessages(expected_messages, log_text)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Пример #36
0
def authenticate(url, auth_config):
    """Look up a Globus transfer token for ``url`` in ``auth_config``.

    Every entry of ``auth_config`` that has a ``uri`` attribute occurring
    (case-insensitively) inside ``url`` is tried in order. The first entry
    that passes ``validate_auth_config`` and has ``auth_type == 'token'``
    supplies the result.

    :param url: URL for which credentials are needed.
    :param auth_config: iterable of keychain entries (objects with ``uri``,
        ``auth_type`` and ``auth_params`` attributes).
    :return: tuple ``(transfer_token, local_endpoint)`` taken from the
        matching entry's ``auth_params``, or ``(None, None)`` when no entry
        yields a token.
    """
    # Materialize the candidates up front so the matching filter is applied
    # to the whole configuration before any entry is processed.
    candidates = [entry for entry in auth_config
                  if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower())]

    for auth in candidates:
        try:
            if not validate_auth_config(auth):
                continue
            if auth.auth_type == 'token':
                return auth.auth_params.transfer_token, auth.auth_params.local_endpoint
        except Exception as e:
            # Best effort: a broken entry must not prevent later entries
            # from being tried. Logger.warn is deprecated, use warning().
            logger.warning("Unhandled exception getting Globus token: %s" %
                           bdbag.get_named_exception(e))

    return None, None
Пример #37
0
def read_keychain(keychain_file, create_default=True):
    """Load a keychain and return its JSON content as ``Auth`` namedtuples.

    The serialized ``DEFAULT_KEYCHAIN`` is used unless ``keychain_file``
    exists on disk, in which case the file content replaces it. When the
    requested file is the default keychain file, it does not exist yet and
    ``create_default`` is true, ``create_default_keychain()`` is attempted
    first; a failure there is only logged.

    :param keychain_file: path of the keychain file to read.
    :param create_default: whether to try creating the default keychain file
        when it is requested but missing.
    :return: the parsed JSON, with every JSON object converted to a
        namedtuple called ``Auth`` whose fields are the object's keys.
    """
    keychain = json.dumps(DEFAULT_KEYCHAIN)
    if keychain_file == DEFAULT_KEYCHAIN_FILE and not os.path.isfile(keychain_file) and create_default:
        logger.debug("No keychain file specified and no default keychain file found, attempting to create one.")
        try:
            create_default_keychain()
        except Exception as e:
            # Not fatal: fall back to the in-memory default keychain below.
            logger.warning(
                "Unable to create default keychain file. A keychain file is required for authentication when "
                "retrieving files from protected remote resources. Either ensure that the default keychain "
                "file %s can be created or provide a different path to a valid keychain file. Error: %s" %
                (DEFAULT_KEYCHAIN_FILE, bdbag.get_named_exception(e)))
    if os.path.isfile(keychain_file):
        with open(keychain_file) as kf:
            keychain = kf.read()

    # Every JSON object becomes an attribute-accessible namedtuple, so
    # callers can write entry.uri / entry.auth_params.username.
    return json.loads(keychain, object_hook=lambda d: collections.namedtuple('Auth', d.keys())(*d.values()))
Пример #38
0
def read_config(config_file, create_default=True):
    """Read the bdbag configuration and return it as an ``OrderedDict``.

    The serialized ``bdbag.DEFAULT_CONFIG`` is the fallback; the content of
    ``config_file`` is used instead whenever that file exists. If the default
    configuration file is requested but missing and ``create_default`` is
    true, ``create_default_config()`` is attempted and any failure is logged
    at debug level only.

    :param config_file: path of the configuration file to read.
    :param create_default: whether to try creating the default configuration
        file when it is requested but missing.
    :return: parsed JSON with object key order preserved.
    """
    default_json = json.dumps(bdbag.DEFAULT_CONFIG)

    default_missing = (config_file == bdbag.DEFAULT_CONFIG_FILE
                       and not os.path.isfile(config_file))
    if default_missing and create_default:
        logger.debug(
            "No default configuration file found, attempting to create one.")
        try:
            create_default_config()
        except Exception as err:
            logger.debug(
                "Unable to create default configuration file %s. Using internal defaults. %s"
                % (bdbag.DEFAULT_CONFIG_FILE, bdbag.get_named_exception(err)))

    if not os.path.isfile(config_file):
        raw_config = default_json
    else:
        with open(config_file) as handle:
            raw_config = handle.read()

    return json.loads(raw_config, object_pairs_hook=OrderedDict)
Пример #39
0
 def test_update_bag_change_metadata_only(self):
     # Updates only the bag metadata (save_manifests=False) and checks that
     # the metadata file was read while none of the "updating manifest-*"
     # messages were logged.
     logger.info(self.getTestHeader('update bag change metadata only - do not save manifests'))
     unexpected_messages = [
         'updating manifest-sha1.txt',
         'updating manifest-sha256.txt',
         'updating manifest-sha512.txt',
         'updating manifest-md5.txt',
     ]
     try:
         metadata_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated_bag = bdb.make_bag(self.test_bag_dir,
                                    update=True,
                                    save_manifests=False,
                                    metadata={"Contact-Name": "nobody"},
                                    metadata_file=metadata_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated_bag, bagit.Bag)
         self.assertExpectedMessages(
             ['Reading bag metadata from file', 'test-metadata.json'], captured)
         self.assertUnexpectedMessages(unexpected_messages, captured)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #40
0
 def test_create_bag_with_config(self):
     # Creates a bag from the test data directory using test-config.json and
     # checks that no sha1/sha256/sha512 (tag)manifests exist and that
     # bag-info.txt carries the expected Contact-Name line.
     # NOTE(review): presumably test-config.json restricts the checksum
     # algorithms and supplies the contact name - verify against the fixture.
     logger.info(self.getTestHeader('create bag with config'))
     absent_manifests = (
         'manifest-sha1.txt',
         'manifest-sha256.txt',
         'manifest-sha512.txt',
         'tagmanifest-sha1.txt',
         'tagmanifest-sha256.txt',
         'tagmanifest-sha512.txt',
     )
     try:
         config_path = ospj(self.test_config_dir, 'test-config.json')
         created_bag = bdb.make_bag(self.test_data_dir, config_file=config_path)
         self.assertIsInstance(created_bag, bagit.Bag)
         for manifest_name in absent_manifests:
             self.assertFalse(ospif(ospj(self.test_data_dir, manifest_name)))
         bag_info_path = ospj(self.test_data_dir, 'bag-info.txt')
         with open(bag_info_path) as handle:
             bag_info_text = handle.read()
         self.assertIn('Contact-Name: bdbag test', bag_info_text)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #41
0
def get_session(url):
    """Return a ``requests`` session for ``url``, authenticated if possible.

    Every ``KEYCHAIN`` entry whose ``uri`` occurs (case-insensitively) in
    ``url`` is tried in order. An entry is skipped when it fails
    ``validate_auth_config`` or has no ``auth_uri``. A session already cached
    in ``SESSIONS`` for the entry's ``uri`` is reused; otherwise a new session
    is created and an ``http-basic`` (POST or GET) or ``http-form`` (POST)
    login request is sent to ``auth_uri``. A response status of 203 or lower
    is treated as success and the session is cached.

    :param url: URL that is about to be requested.
    :return: the cached or newly authenticated session; if authentication was
        attempted but failed, the last session that was created; a fresh
        session when no keychain entry matched at all.
    """
    session = None

    matching_entries = [entry for entry in KEYCHAIN
                        if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower())]

    for auth in matching_entries:

        try:
            if not validate_auth_config(auth):
                continue

            if not auth.auth_uri:
                continue

            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break

            session = requests.session()
            # Reset per entry so a response left over from a previously
            # failed entry is never mistaken for this entry's result.
            response = None

            if auth.auth_type == 'http-basic':
                session.auth = (auth.auth_params.username, auth.auth_params.password)
                auth_method = auth.auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth.auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth.auth_uri, auth=session.auth)
            elif auth.auth_type == 'http-form':
                response = session.post(auth.auth_uri,
                                        {auth.auth_params.username_field: auth.auth_params.username,
                                         auth.auth_params.password_field: auth.auth_params.password})

            if response is None:
                # No request was issued (unknown auth_type or auth_method);
                # report that explicitly instead of failing on None below.
                logger.warning("Unsupported auth_type or auth_method for keychain entry [%s]" % auth.uri)
                continue

            if response.status_code > 203:
                logger.warning('Authentication failed with Status Code: %s %s\n' % (response.status_code, response.text))
            else:
                logger.info("Session established: %s", auth.auth_uri)
                # Store under the same key the cache lookup above uses
                # (auth.uri); storing under auth_uri meant it was never found.
                SESSIONS[auth.uri] = session
                break

        except Exception as e:
            # Best effort: log and move on to the next matching entry.
            logger.warning("Unhandled exception during HTTP(S) authentication: %s" % bdbag.get_named_exception(e))

    return session if session else requests.session()
Пример #42
0
 def test_update_bag_remote(self):
     # Updates the test bag with a remote file manifest and checks the log
     # output as well as the two entries expected in the resulting fetch.txt.
     logger.info(self.getTestHeader('update bag add remote file manifest'))
     expected_fetch_entries = (
         'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json'
         '\t723\tdata/bdbag-profile.json',
         'ark:/88120/r8059v\t632860\tdata/minid_v0.1_Nov_2015.pdf',
     )
     try:
         manifest_path = ospj(self.test_config_dir, 'test-fetch-manifest.json')
         updated_bag = bdb.make_bag(self.test_bag_dir,
                                    update=True,
                                    remote_file_manifest=manifest_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated_bag, bagit.Bag)
         self.assertExpectedMessages(
             ['Generating remote file references from', 'test-fetch-manifest.json'], captured)
         fetch_path = ospj(self.test_bag_dir, 'fetch.txt')
         self.assertTrue(ospif(fetch_path))
         with open(fetch_path) as handle:
             fetch_contents = handle.read()
         for fetch_entry in expected_fetch_entries:
             self.assertIn(fetch_entry, fetch_contents)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #43
0
 def test_update_bag_change_metadata_only(self):
     # Updates only the bag metadata (save_manifests=False) and checks that
     # the metadata file was read while none of the "updating manifest-*"
     # messages were logged.
     header = self.getTestHeader('update bag change metadata only - do not save manifests')
     logger.info(header)
     unexpected_messages = [
         'updating manifest-sha1.txt',
         'updating manifest-sha256.txt',
         'updating manifest-sha512.txt',
         'updating manifest-md5.txt',
     ]
     try:
         metadata_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated_bag = bdb.make_bag(self.test_bag_dir,
                                    update=True,
                                    save_manifests=False,
                                    metadata={"Contact-Name": "nobody"},
                                    metadata_file=metadata_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated_bag, bagit.Bag)
         expected_messages = ['Reading bag metadata from file', 'test-metadata.json']
         self.assertExpectedMessages(expected_messages, captured)
         self.assertUnexpectedMessages(unexpected_messages, captured)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #44
0
 def test_update_bag_prune(self):
     # Updates the test bag with only the md5 algorithm and
     # prune_manifests=True, then checks that no sha1/sha256/sha512 manifest
     # or tagmanifest file exists in the bag directory.
     logger.info(self.getTestHeader('update bag prune manifests'))
     pruned_manifests = (
         'manifest-sha1.txt',
         'manifest-sha256.txt',
         'manifest-sha512.txt',
         'tagmanifest-sha1.txt',
         'tagmanifest-sha256.txt',
         'tagmanifest-sha512.txt',
     )
     try:
         updated_bag = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True, prune_manifests=True)
         self.assertIsInstance(updated_bag, bagit.Bag)
         for manifest_name in pruned_manifests:
             manifest_path = ospj(self.test_bag_dir, manifest_name)
             self.assertFalse(ospif(manifest_path))
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #45
0
def validate_bag(bag_path, fast=False, config_file=bdbag.DEFAULT_CONFIG_FILE):
    """Validate the bag located at ``bag_path``.

    The number of validation processes is read from the ``bag_processes``
    key of the ``bag_config`` section of the configuration (default 1).

    :param bag_path: path of the bag directory to validate.
    :param fast: passed through to ``bagit.Bag.validate``.
    :param config_file: configuration file handed to ``read_config``.
    :raises bagit.BagIncompleteError: re-raised after logging a warning that
        the problem may be due to unresolved remote file references.
    :raises bagit.BagValidationError: raised anew with a message built from
        the named form of every entry in the original error's ``details``.
    :raises RuntimeError: for any other exception raised during validation.
    """
    bag_processes = read_config(config_file)['bag_config'].get('bag_processes', 1)

    try:
        logger.info("Validating bag: %s" % bag_path)
        bagit.Bag(bag_path).validate(bag_processes, fast=fast)
        logger.info("Bag %s is valid" % bag_path)
    except bagit.BagIncompleteError as incomplete:
        transient_hint = (
            "This validation error may be transient if the bag contains unresolved remote file references "
            "from a fetch.txt file. In this case the bag is incomplete but not necessarily invalid. "
            "Resolve remote file references (if any) and re-validate.")
        logger.warning("BagIncompleteError: %s %s", incomplete, transient_hint)
        raise incomplete
    except bagit.BagValidationError as invalid:
        named_errors = [bdbag.get_named_exception(detail) for detail in invalid.details]
        raise bagit.BagValidationError('\nError: '.join(named_errors))
    except Exception as unexpected:
        raise RuntimeError("Unhandled exception while validating bag: %s" % unexpected)
Пример #46
0
 def test_validate_complete_bag_fast(self):
     # Fast validation of the complete test bag is expected not to raise;
     # any exception is turned into a test failure with its named message.
     header = self.getTestHeader('test fast validation complete bag')
     logger.info(header)
     try:
         bdb.validate_bag(self.test_bag_dir, fast=True)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #47
0
 def test_validate_incomplete_bag_fast(self):
     # Fast validation of the incomplete test bag must raise
     # bagit.BagIncompleteError.
     header = self.getTestHeader('test fast validation incomplete bag')
     logger.info(header)
     try:
         with self.assertRaises(bagit.BagIncompleteError):
             bdb.validate_bag(self.test_bag_incomplete_dir, fast=True)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))
Пример #48
0
def get_session(url, auth_config):
    """Return a session for ``url``, authenticated from ``auth_config`` if possible.

    Every entry of ``auth_config`` whose ``uri`` occurs (case-insensitively)
    in ``url`` is tried in order, skipping entries rejected by
    ``validate_auth_config``. A session already cached in ``SESSIONS`` for the
    entry's ``uri`` is reused. Otherwise a new session is obtained from
    ``get_new_session()`` and, depending on ``auth_type``:

    * ``cookie``: each ``name=value`` string in ``auth_params.cookies`` is set
      on the session for the host of ``auth.uri``;
    * ``http-basic``: a POST (default) or GET request with basic credentials
      is sent to ``auth_uri``;
    * ``http-form``: the credentials are POSTed to ``auth_uri`` as form
      fields.

    A response status of 203 or lower is treated as success and the session
    is cached under the entry's ``uri``.

    :param url: URL that is about to be requested.
    :param auth_config: iterable of keychain entries.
    :return: the cached or newly configured session. If no entry produced a
        session at all, a session cached under (or newly created for) the
        ``scheme://netloc`` of ``url`` is returned.
    """
    session = None

    matching_entries = [entry for entry in auth_config
                        if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower())]

    for auth in matching_entries:

        try:
            if not validate_auth_config(auth):
                continue

            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break
            else:
                session = get_new_session()

            if auth.auth_type == 'cookie':
                if auth.auth_params and hasattr(auth.auth_params, 'cookies'):
                    cookies = auth.auth_params.cookies
                    for cookie in cookies:
                        name, value = cookie.split('=', 1)
                        session.cookies.set(name,
                                            value,
                                            domain=urlsplit(auth.uri).hostname,
                                            path='/')
                    SESSIONS[auth.uri] = session
                    break

            # if we get here the assumption is that the auth_type is either http-basic or http-form
            if not keychain.has_auth_attr(auth, 'auth_uri'):
                logger.warning(
                    "Missing required parameter [auth_uri] for auth_type [%s] for keychain entry [%s]"
                    % (auth.auth_type, auth.uri))
                continue

            if not (keychain.has_auth_attr(auth.auth_params, 'username')
                    and keychain.has_auth_attr(auth.auth_params, 'password')):
                logger.warning(
                    "Missing required parameters [username, password] for auth_type [%s] for keychain entry [%s]"
                    % (auth.auth_type, auth.uri))
                continue

            # Reset per entry so a response left over from a previously
            # failed entry is never mistaken for this entry's result.
            response = None

            if auth.auth_type == 'http-basic':
                session.auth = (auth.auth_params.username,
                                auth.auth_params.password)
                auth_method = "post"
                if keychain.has_auth_attr(auth.auth_params, 'auth_method'):
                    auth_method = auth.auth_params.auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth.auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth.auth_uri, auth=session.auth)
                else:
                    logger.warning(
                        "Unsupported auth_method [%s] for auth_type [%s] for keychain entry [%s]"
                        % (auth_method, auth.auth_type, auth.uri))
                    # No request was sent, so there is no response to inspect.
                    continue
            elif auth.auth_type == 'http-form':
                # getattr with a default keeps the "username"/"password"
                # fallbacks reachable when the field-name attributes are
                # absent from auth_params.
                username_field = getattr(auth.auth_params, 'username_field', None) or "username"
                password_field = getattr(auth.auth_params, 'password_field', None) or "password"
                response = session.post(
                    auth.auth_uri, {
                        username_field: auth.auth_params.username,
                        password_field: auth.auth_params.password
                    })

            if response is None:
                # auth_type was none of the handled kinds, so nothing was sent.
                logger.warning(
                    "Unsupported auth_type [%s] for keychain entry [%s]"
                    % (auth.auth_type, auth.uri))
                continue

            if response.status_code > 203:
                logger.warning(
                    'Authentication failed with Status Code: %s %s\n' %
                    (response.status_code, response.text))
            else:
                logger.info("Session established: %s", auth.auth_uri)
                # Store under the same key the cache lookup and the cookie
                # branch use (auth.uri) so the session can be found again.
                SESSIONS[auth.uri] = session
                break

        except Exception as e:
            # Best effort: log and move on to the next matching entry.
            logger.warning(
                "Unhandled exception during HTTP(S) authentication: %s" %
                bdbag.get_named_exception(e))

    if not session:
        # No keychain entry yielded a session: share one per scheme://netloc.
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = get_new_session()
            SESSIONS[base_url] = session

    return session
Пример #49
0
def main():
    """Command-line entry point.

    Parses the CLI arguments and, depending on them, creates or updates a
    bag, extracts an archive, resolves fetch entries, validates the bag,
    archives it, validates it against its profile and/or reverts it.

    :return: 0 on success, 1 if any step raised an exception (the error text
        is then written to stderr).
    """
    sys.stderr.write('\n')

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)

    archive = temp_path = error = None
    result = 0

    try:
        if is_file:
            # just extract the bag if it is an archive and no other conflicting options specified
            if not (args.validate or args.validate_profile or args.resolve_fetch):
                bdb.extract_bag(path)
                sys.stderr.write('\n')
                return result
        # do not try to create or update the bag if the user just wants to validate or complete an existing bag
        elif (not (args.validate or args.validate_profile or args.resolve_fetch)
              or (args.update and bdb.is_bag(path))):
            if args.checksum and 'all' in args.checksum:
                args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
            # create or update the bag depending on the input arguments
            bdb.make_bag(path,
                         args.checksum,
                         args.update,
                         args.skip_manifests,
                         args.prune_manifests,
                         BAG_METADATA or None,
                         args.metadata_file,
                         args.remote_file_manifest,
                         args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path,
                              force=(args.resolve_fetch == 'all'),
                              keychain_file=args.keychain_file)

        if args.validate:
            # archives are validated from a temporary extraction
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path or path,
                             fast=(args.validate == 'fast'),
                             config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        if is_file and archive is None:
            archive = path

        if args.validate_profile:
            # reuse the temporary extraction made for validation, if any
            if is_file and not temp_path:
                temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path or path)
            bdb.validate_bag_serialization(archive or path, profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as exc:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(exc)

    finally:
        # always remove the temporary extraction and report a failure
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    sys.stderr.write('\n')

    return result
Пример #50
0
def main():
    """Command-line entry point.

    Parses the CLI arguments and, depending on them, creates or updates a
    bag, extracts an archive, resolves fetch entries, validates the bag,
    archives it and/or validates it against its profile.

    :return: 0 on success, 1 if any step raised an exception (the error text
        is then written to stderr).
    """
    sys.stderr.write('\n')

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)

    archive = temp_path = error = None
    result = 0

    try:
        if is_file:
            # just extract the bag if it is an archive and no other conflicting options specified
            if not (args.validate or args.validate_profile or args.resolve_fetch):
                bdb.extract_bag(path)
                sys.stderr.write('\n')
                return result
        # do not try to create or update the bag if the user just wants to validate or complete an existing bag
        elif (not (args.validate or args.validate_profile or args.resolve_fetch)
              or (args.update and bdb.is_bag(path))):
            if args.checksum and 'all' in args.checksum:
                args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
            # create or update the bag depending on the input arguments
            bdb.make_bag(path, args.checksum, args.update,
                         args.skip_manifests, args.prune_manifests,
                         BAG_METADATA or None,
                         args.metadata_file, args.remote_file_manifest,
                         args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, args.resolve_fetch == 'all')

        if args.validate:
            # archives are validated from a temporary extraction
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path or path,
                             args.validate == 'fast',
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        if is_file and archive is None:
            archive = path

        if args.validate_profile:
            # reuse the temporary extraction made for validation, if any
            if is_file and not temp_path:
                temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path or path)
            bdb.validate_bag_serialization(archive or path, profile)

    except Exception as exc:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(exc)

    finally:
        # always remove the temporary extraction and report a failure
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    sys.stderr.write('\n')

    return result
Пример #51
0
 def test_validate_complete_bag_fast(self):
     # Fast validation of the complete test bag is expected not to raise;
     # any exception is turned into a test failure with its named message.
     header = self.getTestHeader('test fast validation complete bag')
     logger.info(header)
     try:
         bdb.validate_bag(self.test_bag_dir, fast=True)
     except Exception as err:
         self.fail(bdbag.get_named_exception(err))