class Dataset(object):
    """
    A cldfbench dataset ties together
    - `raw` data, to be used as source for the
    - `cldf` data, which is created using config data from
    - `etc`.

    To use the cldfbench infrastructure, one should sub-class `Dataset`.

    cldfbench supports the following workflow:
    - a `download` command populates a `Dataset`'s `raw` directory.
    - a `makecldf` command (re)creates the CLDF dataset in `cldf`.
    """
    dir = None
    id = None
    metadata_cls = Metadata

    def __init__(self):
        if not self.dir:
            self.dir = pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = DataDir(self.dir)
        md = self.dir / 'metadata.json'
        self.metadata = self.metadata_cls.from_file(md) if md.exists() else self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, self.dir.resolve())

    def cldf_specs(self):
        """
        A `Dataset` must declare all CLDF datasets that are derived from it.

        :return: A single `CLDFSpec` instance, or a `dict`, mapping names to `CLDFSpec` \
        instances, where the name will be used by `cldf_reader`/`cldf_writer` to look up \
        the spec.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(self):
        """
        Turn cldf_specs into a `dict` for simpler lookup.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances.
        """
        specs = self.cldf_specs()
        if isinstance(specs, CLDFSpec):
            return {None: specs}
        assert isinstance(specs, dict)
        return specs

    @lazyproperty
    def cldf_dir(self):
        return self.dir / 'cldf'

    @lazyproperty
    def raw_dir(self):
        return self.dir / 'raw'

    @lazyproperty
    def etc_dir(self):
        return self.dir / 'etc'

    def cldf_writer(self, args, cldf_spec=None, clean=True):
        """
        :param args:
        :param cldf_spec: Key of the relevant `CLDFSpec` in `Dataset.cldf_specs`
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing. \
        Note that `False` must be passed for subsequent calls to `cldf_writer` in case the \
        spec re-uses a directory.

        :return: a `cldf_spec.writer_cls` instance, for write-access to CLDF data. \
        This method should be used in a with-statement, and will then return a `CLDFWriter` with \
        an empty working directory.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(self, cldf_spec=None):
        """
        :param cldf_spec:
        :return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_dataset()

    @lazyproperty
    def repo(self):
        try:
            return Repository(self.dir)
        except ValueError:  # pragma: no cover
            return

    #
    # Workflow commands are implemented with two methods for each command:
    # - cmd_<command>: The implementation of the command, typically overwritten by datasets.
    # - _cmd_<command>: An (optional) wrapper providing setup and teardown functionality, calling
    #   cmd_<command> in between.
    #
    # Workflow commands must accept an `argparse.Namespace` as sole positional argument.
    #
    def _cmd_download(self, args):
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        (self.raw_dir / 'README.md').write_text(
            'Raw data downloaded {0}'.format(datetime.utcnow().isoformat()), encoding='utf8')

    def cmd_download(self, args):
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format('download', self.id))
        return NOOP

    def _cmd_readme(self, args):
        if self.metadata:
            self.dir.joinpath('README.md').write_text(self.cmd_readme(args), encoding='utf8')

    def cmd_readme(self, args):
        return self.metadata.markdown() if self.metadata else ''

    def _cmd_makecldf(self, args):
        specs = list(self.cldf_specs_dict.values())
        if len(specs) == 1:
            # There's only one CLDF spec! We instantiate the writer now and inject it into `args`:
            with self.cldf_writer(args, cldf_spec=specs[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)
        else:
            self.cmd_makecldf(args)

        if self.metadata and self.metadata.known_license:
            legalcode = self.metadata.known_license.legalcode
            if legalcode:
                (self.dir / 'LICENSE').write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args):
        """
        :param args: An `argparse.Namespace` including attributes:
            - `writer`: `CLDFWriter` instance
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id))
        return NOOP
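# Usage sketch (illustrative, not part of cldfbench itself): a hypothetical dataset module
# sub-classing `Dataset`. The dataset id, the example URL and the column names are made up;
# `args.writer.objects` is assumed to be the CLDFWriter's buffer of rows keyed by table name,
# so adjust this to the writer API of the cldfbench version in use.
import csv
import pathlib
import urllib.request


class MyDataset(Dataset):
    id = 'mydataset'
    dir = pathlib.Path(__file__).parent

    def cmd_download(self, args):
        # `_cmd_download` has already created `raw/`; just drop the source file into it.
        urllib.request.urlretrieve('https://example.org/data.csv', str(self.raw_dir / 'data.csv'))

    def cmd_makecldf(self, args):
        # With a single (default) CLDFSpec, `_cmd_makecldf` injects an open writer as `args.writer`.
        with open(self.raw_dir / 'data.csv', encoding='utf8') as fp:
            for row in csv.DictReader(fp):
                args.writer.objects['ValueTable'].append({
                    'ID': row['id'],
                    'Language_ID': row['language'],
                    'Parameter_ID': row['feature'],
                    'Value': row['value'],
                })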
def run(args):
    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError

    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError

    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error('You cannot create the release and update zenodo at the same time.')
            raise ParserError

    if args.create_release:
        if not args.parent_doi:
            args.log.error('The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(tqdm.tqdm(
                    [r for r in ds_cldf['media.csv']], desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types \
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(MEDIA, media_dir.resolve())
        release_dir.mkdir(exist_ok=True)
        media.append(media_dir / INDEX_CSV)

        try:
            zipf = zipfile.ZipFile(
                str(release_dir / '{0}.zip'.format(MEDIA)), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(
                release_dir / ZENODO_FILE_NAME, indent=4, default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                          [c.strip() for c in nfilter(args.communities.split(','))] + \
                          COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [
                    {"identifier": community_id} for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi', 'identifier': args.parent_doi, 'relation': 'isPartOf'})
                supplement_to = " - Supplement to dataset " \
                    "<a href='https://doi.org/{0}'>{1}</a> ".format(
                        args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'})
            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(online_url)
            md['description'] = html.escape(DESCRIPTION.format(
                url=online_url,
                formats=' ({0})'.format(formats) if formats else '',
                title=md['title'],
                supplement_to=supplement_to,
                descr=descr,
                online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write('README.md', README.format(
                title=md['title'],
                doi='https://doi.org/{0}'.format(args.parent_doi),
                ds_title=ds.metadata.title,
                license=license_md,
                formats=' ({0})'.format(formats) if formats else '',
                media=MEDIA,
                index=INDEX_CSV))

    if args.update_zenodo:
        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error('Check connection and credentials for accessing Zenodo.\n{0}'.format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warn('Passed deposit ID does not refer to latest version {0}!'.format(
                latest_version))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Version: ' + rec.metadata.version)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
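# The command above delegates fetching to `_create_download_thread` and later joins the
# threads collected in the module-level `download_threads` list; neither is shown in this
# section. A minimal sketch of how such a helper could look (illustrative only, not the
# actual implementation, which may also throttle concurrency) is:
import threading
import urllib.request

download_threads = []


def _create_download_thread(url, target):
    # Fetch `url` into `target` on a background thread and remember the thread so the
    # caller can `join()` it before zipping the media directory.
    def _download():
        target.parent.mkdir(parents=True, exist_ok=True)
        urllib.request.urlretrieve(url, str(target))

    t = threading.Thread(target=_download)
    t.start()
    download_threads.append(t)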
def run(args):
    ds = Dataset().cldf_reader()
    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(audio.resolve())
        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            zipf = zipfile.ZipFile(str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                for f in files:
                    if not f.startswith('.') and not f.startswith('__') \
                            and ((args.mimetype is None) or f.endswith(args.mimetype)):
                        zipf.write(
                            os.path.join(root, f),
                            os.path.relpath(os.path.join(root, f), fp))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}

        with jsonlib.update(
                release_dir / zenodo_file_name, indent=4, default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [{'id': community_id} for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, Dataset().id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': Dataset().metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(DESCRIPTION.format(
                GITHUB_PREFIX,
                Dataset().id,
                Dataset().metadata.url if Dataset().metadata.url else '',
                VERSION))

            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)

            DataDir(release_dir).write('README.md', RELEASE_NOTE.format(
                md['title'],
                GITHUB_PREFIX,
                Dataset().id,
                Dataset().metadata.title,
                license_md))

    if args.update_zenodo:
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
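# Both release commands above compare `md5(target)` against the media row's `ID`, i.e. files
# are named by their checksum and re-downloaded whenever the content on disk does not match.
# A helper with that behaviour (a sketch under that assumption, not necessarily the one
# imported by these modules) could look like this:
import hashlib
import pathlib


def md5(path: pathlib.Path, chunk_size: int = 64 * 1024) -> str:
    # Hash the file in chunks so large media files do not have to fit into memory.
    h = hashlib.md5()
    with path.open('rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()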
class Dataset(object):
    """
    A cldfbench dataset ties together
    - `raw` data, to be used as source for the
    - `cldf` data, which is created using config data from
    - `etc`.

    To use the cldfbench infrastructure, one should sub-class `Dataset`.

    cldfbench supports the following workflow:
    - a `download` command populates a `Dataset`'s `raw` directory.
    - a `makecldf` command (re)creates the CLDF dataset in `cldf`.

    The following class attributes are supposed to be overwritten by subclasses:

    :ivar dir: `pathlib.Path` pointing to the root directory of the dataset.
    :ivar id: A `str` identifier for the dataset. No assumption about uniqueness properties of \
    this identifier is made.
    :ivar metadata_cls: Subclass of :class:`Metadata` (or :class:`Metadata` if not overwritten)
    """
    dir = None
    id = None
    metadata_cls = Metadata

    def __init__(self):
        if not self.dir:
            self.dir = pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = DataDir(self.dir)
        md = self.dir / 'metadata.json'
        self.metadata = self.metadata_cls.from_file(md) if md.exists() else self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, self.dir.resolve())

    @lazyproperty
    def cldf_dir(self) -> DataDir:
        """
        Directory where CLDF data generated from the Dataset will be stored (unless specified
        differently by a :class:`CLDFSpec`).
        """
        return self.dir / 'cldf'

    @lazyproperty
    def raw_dir(self) -> DataDir:
        """
        Directory where cldfbench expects the raw or source data.
        """
        return self.dir / 'raw'

    @lazyproperty
    def etc_dir(self) -> DataDir:
        """
        Directory where cldfbench expects additional configuration or metadata.
        """
        return self.dir / 'etc'

    def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]:
        """
        A `Dataset` must declare all CLDF datasets that are derived from it.

        :return: A single :class:`CLDFSpec` instance, or a `dict`, mapping names to `CLDFSpec` \
        instances, where the name will be used by `cldf_reader`/`cldf_writer` to look up \
        the spec.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(self) -> typing.Dict[typing.Union[str, None], CLDFSpec]:
        """
        Turn :meth:`cldf_specs` into a `dict` for simpler lookup.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances.
        """
        specs = self.cldf_specs()
        if isinstance(specs, CLDFSpec):
            return {None: specs}
        assert isinstance(specs, dict)
        return specs

    def update_submodules(self):
        """
        Convenience method to be used in a `Dataset`'s `cmd_download` to update raw data curated
        as git submodules.
        """
        subprocess.check_call(
            'git -C {} submodule update --remote'.format(self.dir.resolve()), shell=True)

    def cldf_writer(self, args, cldf_spec=None, clean=True) -> CLDFWriter:
        """
        :param args:
        :param cldf_spec: Key of the relevant `CLDFSpec` in `Dataset.cldf_specs`
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing. \
        Note that `False` must be passed for subsequent calls to `cldf_writer` in case the \
        spec re-uses a directory.

        :return: a `cldf_spec.writer_cls` instance, for write-access to CLDF data. \
        This method should be used in a with-statement, and will then return a `CLDFWriter` with \
        an empty working directory.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(self, cldf_spec: typing.Union[str, None] = None) -> pycldf.Dataset:
        """
        :param cldf_spec:
        :return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_dataset()

    @lazyproperty
    def repo(self) -> typing.Union[Repository, None]:
        """
        The git repository cloned to the dataset's directory (or `None`).
        """
        try:
            return Repository(self.dir)
        except ValueError:  # pragma: no cover
            return

    def _cmd_download(self, args):
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        (self.raw_dir / 'README.md').write_text(
            'Raw data downloaded {0}'.format(datetime.utcnow().isoformat()), encoding='utf8')

    def cmd_download(self, args: argparse.Namespace):
        """
        Implementations of this method should populate the dataset's `raw_dir` with the source
        data.
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format('download', self.id))
        return NOOP

    def _cmd_readme(self, args):
        if self.metadata:
            badge = build_status_badge(self)
            md = self.cmd_readme(args)
            if badge:
                lines, title_found = [], False
                for line in md.split('\n'):
                    lines.append(line)
                    if line.startswith('# ') and not title_found:
                        title_found = True
                        lines.extend(['', badge])
                md = '\n'.join(lines)
            section = [
                '\n\n## CLDF Datasets\n',
                'The following CLDF datasets are available in [{0}]({0}):\n'.format(
                    self.cldf_dir.resolve().relative_to(self.dir.resolve()))]
            for ds in self.cldf_specs_dict.values():
                if ds.metadata_path.exists():
                    p = ds.metadata_path.resolve().relative_to(self.dir.resolve())
                    section.append(
                        '- CLDF [{0}](https://github.com/cldf/cldf/tree/master/modules/{0}) '
                        'at [{1}]({1})'.format(ds.module, p))
            self.dir.joinpath('README.md').write_text(md + '\n'.join(section), encoding='utf8')

    def cmd_readme(self, args: argparse.Namespace) -> str:
        """
        Implementations of this method should create the content for the dataset's README.md
        and return it as a markdown formatted string.
        """
        return self.metadata.markdown() if self.metadata else ''

    def _cmd_makecldf(self, args):
        specs = list(self.cldf_specs_dict.values())
        if len(specs) == 1:
            # There's only one CLDF spec! We instantiate the writer now and inject it into `args`:
            with self.cldf_writer(args, cldf_spec=specs[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)
        else:
            self.cmd_makecldf(args)

        if self.metadata and self.metadata.known_license:
            legalcode = self.metadata.known_license.legalcode
            if legalcode:
                (self.dir / 'LICENSE').write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args: argparse.Namespace):
        """
        Implementations of this method should write the CLDF data curated by the dataset.

        :param args: An `argparse.Namespace` including attributes:
            - `writer`: :class:`CLDFWriter` instance
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id))
        return NOOP