def from_uri(uri):
    """Get provider type based on uri."""
    is_doi_ = is_doi(uri)
    if is_doi_ is None:
        url = urlparse(uri)
        if not (url.scheme and url.netloc and url.params == ''):
            return None, 'Cannot parse URL.'

    provider = None
    warning = ''
    for potential_provider in ProviderFactory.PROVIDERS.values():
        try:
            if potential_provider.supports(uri):
                provider = potential_provider
                break
        except Exception as e:
            warning += 'Couldn\'t test provider {prov}: {err}\n'.format(
                prov=potential_provider, err=e)

    supported_providers = ', '.join(ProviderFactory.PROVIDERS.keys())

    if is_doi_ and provider is None:
        return None, (
            warning + 'Provider {} not found. '.format(
                uri.split('/')[1].split('.')[0]  # Get the DOI provider name.
            ) + 'Currently supported providers: {}'.format(
                supported_providers))
    elif provider is None:
        return None, (
            warning + 'Provider not found for {}. '.format(uri) +
            'Currently supported providers: {}'.format(supported_providers))

    return provider(is_doi=is_doi_), warning
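# The helpers below are a minimal sketch of the assumed contract of
# renku.core.utils.doi.is_doi / extract_doi, inferred from how the snippets
# in this section use them: is_doi returns a regex match object (truthy,
# with .group(0)) or None, and extract_doi returns the bare DOI from a DOI
# or DOI URL. The real implementations live in renku; the pattern here is
# an illustrative assumption, not the library's exact regex.
import re

_DOI_PATTERN = re.compile(r'(10\.\d{4,}(?:\.\d+)*/\S+)')  # assumed pattern


def is_doi(uri):
    """Return a match object if uri contains a DOI, else None (sketch)."""
    return _DOI_PATTERN.search(uri)


def extract_doi(uri):
    """Return the bare DOI in uri, e.g. '10.5281/zenodo.3352150' (sketch)."""
    match = _DOI_PATTERN.search(uri)
    return match.group(1) if match else None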
def _migrate_doi_identifier(data, client):
    """If the dataset _id is doi, make it a UUID."""
    import uuid

    from renku.core.utils.doi import is_doi
    from renku.core.utils.uuid import is_uuid

    _id = data.get('_id', '')
    identifier = data.get('identifier', '')

    if not is_uuid(_id):
        if not is_uuid(identifier):
            data['identifier'] = str(uuid.uuid4())
        if is_doi(data.get('_id', '')):
            data['same_as'] = {'@type': ['schema:URL'], 'url': data['_id']}
            if data.get('@context'):
                data['@context'].setdefault(
                    'same_as', {
                        '@id': 'schema:sameAs',
                        '@type': 'schema:URL',
                        '@context': {
                            '@version': '1.1',
                            'url': 'schema:url',
                            'schema': 'http://schema.org/'
                        }
                    })
        data['_id'] = data['identifier']
    return data
def supports(uri):
    """Check if provider supports a given uri."""
    is_doi_ = is_doi(uri)
    is_dataverse_uri = is_doi_ is None and check_dataverse_uri(uri)
    is_dataverse_doi = is_doi_ and check_dataverse_doi(is_doi_.group(0))
    return is_dataverse_uri or is_dataverse_doi
def migrate_doi_identifier(data):
    """If the dataset has a doi, make identifier be based on it."""
    from renku.core.utils.doi import extract_doi, is_doi

    if is_doi(data.get('_id', '')):
        data['identifier'] = extract_doi(data.get('_id'))
        data['same_as'] = data['_id']
        if data.get('@context'):
            data['@context'].setdefault('same_as', 'schema:sameAs')
    return data
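# A minimal illustration of migrate_doi_identifier, assuming a record whose
# _id is a DOI URL (real records carry more keys; this is a sketch and the
# DOI value is illustrative):
record = {'_id': 'https://doi.org/10.5281/zenodo.3352150'}
migrated = migrate_doi_identifier(record)
assert migrated['identifier'] == '10.5281/zenodo.3352150'
assert migrated['same_as'] == 'https://doi.org/10.5281/zenodo.3352150'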
def test_doi_migration(dataset_metadata):
    """Test migration of id with doi."""
    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )
    assert is_doi(dataset.identifier)
    assert urljoin(
        'https://localhost',
        'datasets/' + quote(dataset.identifier, safe='')
    ) == dataset._id
    assert dataset.same_as == urljoin('https://doi.org', dataset.identifier)
def test_dataset_doi_metadata(dataset_metadata):
    """Check dataset metadata for correct DOI."""
    from renku.core.utils.doi import is_doi

    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )

    if is_doi(dataset.identifier):
        assert urljoin('https://doi.org',
                       dataset.identifier) == dataset.same_as
    assert dataset._id.endswith('datasets/{}'.format(
        quote(dataset.identifier, safe='')))
def update_metadata(self, other_dataset):
    """Update instance attributes with other dataset attributes.

    :param other_dataset: `Dataset`
    :return: self
    """
    if is_doi(other_dataset.identifier):
        self.same_as = urllib.parse.urljoin(
            'https://doi.org', other_dataset.identifier)

    for field_ in self.EDITABLE_FIELDS:
        val = getattr(other_dataset, field_)
        if val:
            setattr(self, field_, val)

    return self
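# Usage sketch for update_metadata: `local_dataset` and `imported_dataset`
# are hypothetical Dataset instances. Only fields listed in EDITABLE_FIELDS
# (defined on the class, not shown in these snippets) are copied, and only
# when the incoming value is truthy; a DOI identifier additionally rewrites
# same_as to point at https://doi.org/<doi>.
local_dataset.update_metadata(imported_dataset)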
def from_uri(uri):
    """Get provider type based on uri."""
    is_doi_ = is_doi(uri)
    if is_doi_ is None:
        url = urlparse(uri)
        if not (url.scheme and url.netloc and url.params == ""):
            return None, "Cannot parse URL."

    provider = None
    warning = ""
    for potential_provider in ProviderFactory.PROVIDERS.values():
        try:
            if potential_provider.supports(uri):
                provider = potential_provider
                break
        except Exception as e:
            warning += "Couldn't test provider {prov}: {err}\n".format(
                prov=potential_provider, err=e)

    supported_providers = ", ".join(ProviderFactory.PROVIDERS.keys())

    if is_doi_ and provider is None:
        return (
            None,
            (warning + "Reason: provider {} not found".format(
                uri.split("/")[1].split(".")[0]  # Get the DOI provider name.
            ) + "\nHint: Supported providers are: {}".format(
                supported_providers)),
        )
    elif provider is None:
        return (
            None,
            (warning + "Reason: provider not found for {}".format(uri) +
             "\nHint: Supported providers are: {}".format(
                 supported_providers)),
        )

    return provider(is_doi=is_doi_), warning
def short_id(self):
    """Shorter version of identifier."""
    if is_doi(self.identifier):
        return self.identifier
    return str(self.uid)[:8]
def uid(self):
    """UUID part of identifier."""
    if is_doi(self.identifier):
        return self.identifier
    return self.identifier.split('/')[-1]
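# Behaviour sketch for the two properties above (values illustrative): a DOI
# identifier is returned unchanged by both; an identifier ending in a UUID
# yields the UUID as uid and its first 8 characters as short_id.
#
#   identifier == '10.5281/zenodo.3352150'
#       uid      -> '10.5281/zenodo.3352150'
#       short_id -> '10.5281/zenodo.3352150'
#   identifier == 'https://localhost/datasets/2951b9e9-5cec-4d37-a02d-7b0a66ba2e1a'
#       uid      -> '2951b9e9-5cec-4d37-a02d-7b0a66ba2e1a'
#       short_id -> '2951b9e9'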
def _extract_doi(value):
    """Return either a string or the doi part of a URL."""
    value = str(value)
    if is_doi(value):
        return extract_doi(value)
    return value
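# Usage sketch (values illustrative, assuming extract_doi strips the
# resolver prefix): a DOI URL is reduced to its bare DOI, while a non-DOI
# value is passed through as a plain string.
assert _extract_doi('https://doi.org/10.5281/zenodo.3352150') == \
    '10.5281/zenodo.3352150'
assert _extract_doi('not-a-doi') == 'not-a-doi'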
def supports(uri):
    """Whether or not this provider supports a given uri."""
    return bool(is_doi(uri))
def supports(uri):
    """Whether or not this provider supports a given uri."""
    return is_doi(uri) is not None
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

        for file_ in files:
            if file_.size_in_mb is not None:
                total_size += file_.size_in_mb

        total_size *= 2**20  # Convert megabytes to bytes.

    except KeyError as e:
        raise ParameterError((
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        ))
    except LookupError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Reason: {1}'.format(uri, str(e)))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.urljoin(
                    'https://doi.org', dataset.identifier
                )
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            # Hyphen goes last in the character class so it is a literal
            # hyphen rather than the range '.-_'.
            tag_name = re.sub('[^a-zA-Z0-9._-]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        short_name = short_name or dataset.short_name
        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )
def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                ))

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                text_prompt = (WARNING + "Newer version found at {}\n".format(
                    record.links.get("latest_html")) + text_prompt)

            click.confirm(text_prompt, abort=True)

        for file_ in files:
            if file_.size_in_mb is not None:
                total_size += file_.size_in_mb

        total_size *= 2**20  # Convert megabytes to bytes.

    except KeyError as e:
        raise ParameterError(
            ("Could not process {0}.\n"
             "Unable to fetch metadata due to {1}".format(uri, e)))
    except LookupError as e:
        raise ParameterError(("Could not process {0}.\n"
                              "Reason: {1}".format(uri, str(e))))

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            dataset.same_as = Url(url_str=urllib.parse.urljoin(
                "https://doi.org", dataset.identifier))

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            # Hyphen goes last in the character class so it is a literal
            # hyphen rather than the range '.-_'.
            tag_name = re.sub("[^a-zA-Z0-9._-]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version))
    else:
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}")

        sources = [f"{dataset.data_dir}/**"]

        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:
                # Files that are not in dataset's data directory.
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )
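# Call sketch for the newer import_dataset variant above: `client` is a
# hypothetical LocalClient for the current project and the DOI is
# illustrative. The URI is resolved to a provider via
# ProviderFactory.from_uri before any files are downloaded.
import_dataset(
    client,
    "10.5281/zenodo.3352150",
    name="zenodo-example",
    extract=True,  # unpack downloaded archives into the dataset
)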