def from_stream(cls, stream, spec=None):
    """
    Create an instance from CSV content read from `stream`.

    :param stream: Object with a `read()` method returning the CSV text; its lines are \
    passed to `reader(..., dicts=True)`.
    :param spec: Specification handed to every `IGT` and to `cls`; a fresh `CorpusSpec()` \
    is used when a falsy value is passed.
    :return: `cls(igts, spec=spec)` where `igts` holds one `IGT` per row of the input.
    """
    from csvw.metadata import TableGroup

    # A dataset with just an ExampleTable component; it is only used to look up
    # the column names via `cls.get_column_names`.
    dataset = Dataset(TableGroup(fname=pathlib.Path('tmp.json')))
    dataset.add_component('ExampleTable')

    spec = spec or CorpusSpec()
    names = cls.get_column_names(dataset)

    rows = reader(stream.read().splitlines(), dicts=True)
    examples = []
    for row in rows:
        # NOTE: the separator is the two-character sequence backslash + "t",
        # not an actual TAB character.
        examples.append(IGT(
            id=row[names.id],
            gloss=row[names.gloss].split('\\t'),
            phrase=row[names.phrase].split('\\t'),
            language=row.get(names.language),
            properties=row,
            spec=spec,
        ))
    return cls(examples, spec=spec)
def run(args):
    """
    List, download, package and/or publish the media files of the dataset.

    The behaviour is selected by attributes of `args`:

    - `args.list`: sum up `size` and count the rows of `media.csv` per `mimetype` and print
      one tab-separated line per mimetype. Nothing is downloaded in this mode.
    - neither `args.list` nor `args.update_zenodo`: for every row of `media.csv` start a
      download (via `create_download_thread`) into `<args.out>/audio/<Parameter_ID>/`,
      unless the target file exists and `md5(target)` equals the row's `ID`.
    - `args.create_release`: zip the content of `<args.out>/audio` into
      `<release_dir>/audio.zip`, update `<release_dir>/zenodo.json` and write a `README.md`.
    - `args.update_zenodo`: after interactive confirmation, push the metadata from
      `zenodo.json` to the deposit with this ID and publish it if it is not yet published.

    `args.mimetype`, if not `None`, restricts downloads and zipped files to names ending
    with this string.

    :param args: Namespace providing `out` (a path), `log` (a logger) and the flags above.
    :raises AssertionError: If a release is requested but the audio folder is missing, or \
    if a Zenodo update is requested but the release folder or `zenodo.json` is missing.
    """
    # Instantiate the dataset once and reuse it, rather than calling `Dataset()` for each
    # attribute lookup.
    dataset = Dataset()
    ds = dataset.cldf_reader()

    release_dir = args.out / '{0}_audio'.format(dataset.id)
    zenodo_file_name = 'zenodo.json'
    # Bound unconditionally, because the release step below needs it even when `--list`
    # is passed; the directory itself is only created in download mode.
    audio = args.out / 'audio'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        # Maps form IDs to parameter IDs; downloads are grouped in one folder per parameter.
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        # Materialize the rows so that tqdm can determine the total.
        for row in tqdm.tqdm(list(ds['media.csv'])):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
                continue

            d = audio / f2c[row['Form_ID']]
            d.mkdir(exist_ok=True)
            url = ds.get_row_url('media.csv', row)
            target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
            # (Re-)download if the file is missing or `md5(target)` differs from the row ID.
            if (not target.exists()) or md5(target) != row['ID']:
                if (args.mimetype is None) or target.suffix.endswith(args.mimetype):
                    create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(audio.resolve())

        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            # The context manager makes sure the archive is closed even if writing fails.
            with zipfile.ZipFile(
                    str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _dirs, files in tqdm.tqdm(os.walk(audio)):
                    for f in files:
                        # Skip hidden and "dunder"-prefixed files.
                        if f.startswith(('.', '__')):
                            continue
                        if (args.mimetype is None) or f.endswith(args.mimetype):
                            path = os.path.join(root, f)
                            # Archive names are relative to the output directory.
                            zipf.write(path, os.path.relpath(path, args.out))
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            """Reduce a contributor dict to the keys which are put into the metadata."""
            return {
                k: v for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}}

        with jsonlib.update(
                release_dir / zenodo_file_name,
                indent=4,
                default=collections.OrderedDict()) as md:
            contribs = dataset.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [
                    {'id': community_id} for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(dataset.metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                # NOTE(review): presumably Zenodo's combined "Video/Audio" upload type -
                # confirm against the Zenodo API documentation.
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, dataset.id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if dataset.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': dataset.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    dataset.id,
                    dataset.metadata.url or '',
                    VERSION))

            license_md = ''
            if dataset.metadata.zenodo_license:
                md['license'] = {'id': dataset.metadata.zenodo_license}
                license_md = LISENCE.format(dataset.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(
                    md['title'],
                    GITHUB_PREFIX,
                    dataset.id,
                    dataset.metadata.title,
                    license_md))

    if args.update_zenodo:
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = dict(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
        # Show what is about to be overwritten before asking for confirmation.
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')