def test_update(tmppath):
    p = tmppath / 'test'
    with pytest.raises(ValueError):
        with update(p):
            pass  # pragma: no cover
    with update(p, default={}) as obj:
        obj['a'] = 1
    with update(p) as obj:
        assert obj['a'] == 1
        obj['a'] = 2
    with update(p) as obj:
        assert obj['a'] == 2
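# A minimal sketch of the update() context manager that the test above exercises,
# inferred from its observed behaviour: a missing file without a `default` raises
# ValueError; otherwise the loaded (or default) object is yielded and written back
# on exit. The real clldutils.jsonlib.update may differ in signature and details;
# `update_sketch` and its parameters are illustrative only.
import contextlib
import json
import pathlib


@contextlib.contextmanager
def update_sketch(path, default=None, **dump_kw):
    path = pathlib.Path(path)
    if path.exists():
        obj = json.loads(path.read_text(encoding='utf8'))
    elif default is None:
        raise ValueError('file does not exist: {0}'.format(path))
    else:
        obj = default
    yield obj
    # Whatever the caller mutated is serialized back to the same file;
    # keyword arguments such as indent or sort_keys are passed on to json.dumps.
    path.write_text(json.dumps(obj, **dump_kw), encoding='utf8')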
def run(args):
    dataset = get_dataset(args)
    with update(
            dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
        modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
        contribs = dataset.dir / 'CONTRIBUTORS.md'
        creators, contributors = get_creators_and_contributors(
            contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
        if creators:
            md['creators'] = [contrib(p) for p in creators]
        if contributors:
            md["contributors"] = [contrib(p) for p in contributors]
        communities = [r["identifier"] for r in md.get("communities", [])] + \
            [c.strip() for c in nfilter(args.communities.split(','))]
        if communities:
            md['communities'] = [
                {"identifier": community_id} for community_id in sorted(set(communities))]
        md.update(
            {
                "title": dataset.metadata.title,
                "access_right": "open",
                "keywords": sorted(set(md.get("keywords", []) + ["linguistics"] + modules)),
                "upload_type": "dataset",
            }
        )
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                    html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}
def to_replacements(self, filename):
    """Write a JSON file with 301s from merged glottolog_ref_ids."""
    select_pairs = sa.select([Entry.refid.label('id'), Entry.id.label('replacement')])\
        .where(Entry.id != Entry.refid)\
        .order_by(Entry.id)
    with self.execute(select_pairs) as cursor:
        pairs = list(map(dict, cursor))
    with jsonlib.update(filename, default=[], indent=4) as repls:
        repls.extend(pairs)
def to_replacements(self, filename):
    """Write a JSON file with 301s from merged glottolog_ref_ids."""
    with self.connect() as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.execute('SELECT refid AS id, id AS replacement '
                              'FROM entry WHERE id != refid ORDER BY id')
        pairs = map(dict, cursor)
        with jsonlib.update(filename, default=[], indent=4) as repls:
            repls.extend(pairs)
def to_replacements(self, filename, *, indent: typing.Optional[int] = 4):
    """Write a JSON file with 301s from merged ``glottolog_ref_id``s."""
    select_pairs = (sa.select(Entry.refid.label('id'),
                              Entry.id.label('replacement'))
                    .where(Entry.id != Entry.refid)
                    .order_by('replacement'))
    with self.execute(select_pairs) as result:
        pairs = result.mappings().all()
    with jsonlib.update(filename, default=[], indent=indent) as repls:
        # RowMapping is not JSON serializable
        repls.extend(map(dict, pairs))
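# The three to_replacements() variants above all write the same structure: a JSON
# array of {"id": ..., "replacement": ...} pairs, one per merged glottolog_ref_id.
# A minimal, hypothetical consumer that turns such a file into a redirect lookup
# could look like this; `replacements.json` is just an example filename.
import json


def load_redirects(filename='replacements.json'):
    with open(filename, encoding='utf8') as fp:
        pairs = json.load(fp)
    # Map each old (merged) ref id to the id it was replaced by, e.g. for issuing 301s.
    return {pair['id']: pair['replacement'] for pair in pairs}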
def tdwg(args):
    """
    Assign societies to TDWG regions
    """
    try:
        import fiona
        from shapely.geometry import Point
    except ImportError:
        args.log.error('fiona and shapely must be installed for this command')
        return

    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    with fiona.collection(
            args.repos.path("geo", "level2-shape/level2.shp").as_posix(), "r") as source:
        regions = [f for f in source]

    with update(args.repos.path("geo", "societies_tdwg.json"), default={}, indent=4) as soc_tdwg:
        for ds in args.repos.datasets:
            for soc in ds.societies:
                spec = soc_tdwg.get(
                    soc.id, dict(lat=soc.Lat, lon=soc.Long, name=None, code=None))
                if isclose(spec['lat'], soc.Lat) \
                        and isclose(spec['lon'], soc.Long) \
                        and spec['code']:
                    continue
                region, dist = geo.match(Point(spec['lon'], spec['lat']), regions)
                spec['name'] = region['properties']['REGION_NAM']
                spec['code'] = region['properties']['TDWG_CODE']
                if dist == 0:
                    args.log.info('{0} contained in region {1}'.format(soc, spec['name']))
                else:
                    args.log.warn(
                        'assigning {0} to nearest region {1}, distance {2}'.format(
                            soc, region['properties']['REGION_NAM'], dist))
                soc_tdwg[soc.id] = spec
def dl2cdstar(args):
    app = app_name(args.project)
    if not app:
        args.log.error('cannot parse package name')
        return
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return
    title_pattern = re.compile('%s (?P<version>[0-9.]+) - downloads' % re.escape(app))
    title = '{0} {1} - downloads'.format(app, args.version)
    pkg_dir = args.project.joinpath(app)
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        obj = cat.api.get_object()
        obj.metadata = {"creator": "pycdstar", "title": title}
        if args.args:
            obj.metadata["description"] = args.args[0]
        for fname in pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                print(fname.name)
                obj.add_bitstream(
                    fname=fname.as_posix(), name=fname.name.replace('-', '_'))
        cat.add(obj)

    fname = pkg_dir.joinpath('static', 'downloads.json')
    with update(fname, default={}, indent=4) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if match.group('version') not in downloads:
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(fname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
def cdstar(args):
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        obj = cat.api.get_object()
        obj.metadata = {
            "creator": "pycdstar",
            "title": "glottolog %s - downloads" % args.args[0],
            "description": "Custom downloads for release %s of "
                           "[Glottolog](http://glottolog.org)" % args.args[0],
        }
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                print(fname.name)
                obj.add_bitstream(fname=fname.as_posix(), name=fname.name.replace('-', '_'))
        cat.add(obj)

    fname = args.pkg_dir.joinpath('static', 'downloads.json')
    with update(fname, default={}, indent=4) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if match.group('version') not in downloads:
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(fname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
def run(args):
    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.version
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with args.catalog_class(args.catalog, args.url, args.user, args.pwd) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            print('adding bitstreams to {0}'.format(downloads[release]['oid']))
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        obj.read()
        cat.add(obj, update=True)

    with update(dlfname, default=collections.OrderedDict(), indent=4, sort_keys=True) as downloads:
        for oid, spec in load(args.catalog).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads) \
                            or match.group('version') == release:
                        args.log.info('update info for release {0}'.format(match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(args.catalog))
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from languagesource as ls, language as l, source as s, ref as r
where ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        meds[lpk].append((spk, sid, sname, med_type, year, pages))  # The last one is the overall MED

    # Now weed out the "newer but worse" sources:
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # If year is more recent, this is a "newer but worse" item
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))):
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname)
            for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue

        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name
from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk)
from language as l, treeclosuretable as t, parameter as p, valueset as vs, value as v
where l.pk = t.parent_pk and t.child_pk = vs.language_pk and vs.parameter_pk = p.pk
    and p.id = 'macroarea' and v.valueset_pk = vs.pk and l.pk not in (
        select language_pk from valueset as _vs, parameter as _p
        where _vs.parameter_pk = _p.pk and _p.id = 'macroarea')
group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'macroarea')\
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join(
            [macroareas[v.domainelement_pk][1] for v in vs.values])

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        raise ValueError(row)

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1] for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
def cdstar(args):
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return
    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.args[0]
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        cat.add(obj, update=True)

    with update(dlfname, default=collections.OrderedDict(), indent=4, sort_keys=True) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads) \
                            or match.group('version') == release:
                        args.log.info('update info for release {0}'.format(match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
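# In the cdstar/download commands above, downloads.json ends up keyed by release
# version, with each value being the cdstarcat catalog entry for the matching
# downloads object plus its CDSTAR object id under 'oid'. A rough, assumed shape
# (the exact fields inside 'metadata' and 'bitstreams' depend on the catalog):
#
# {
#     "4.1": {
#         "oid": "EAEA0-...",
#         "metadata": {"creator": "pycdstar", "title": "glottolog 4.1 - downloads"},
#         "bitstreams": [...]
#     }
# }
#
# A hypothetical helper for looking up the object id recorded for a release:
import json


def oid_for_release(dlfname, release):
    # Return the CDSTAR object id recorded for `release`, or None if not present.
    with open(str(dlfname), encoding='utf8') as fp:
        downloads = json.load(fp)
    return downloads.get(release, {}).get('oid')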
def run(args):
    ds = Dataset().cldf_reader()
    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(audio.resolve())
        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            zipf = zipfile.ZipFile(
                str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                for f in files:
                    if not f.startswith('.') and not f.startswith('__')\
                            and ((args.mimetype is None) or f.endswith(args.mimetype)):
                        zipf.write(
                            os.path.join(root, f),
                            os.path.relpath(os.path.join(root, f), fp))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}

        with jsonlib.update(
                release_dir / zenodo_file_name, indent=4,
                default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [{'id': community_id} for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, Dataset().id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': Dataset().metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(DESCRIPTION.format(
                GITHUB_PREFIX,
                Dataset().id,
                Dataset().metadata.url if Dataset().metadata.url else '',
                VERSION))

            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)

            DataDir(release_dir).write('README.md', RELEASE_NOTE.format(
                md['title'],
                GITHUB_PREFIX,
                Dataset().id,
                Dataset().metadata.title,
                license_md))

    if args.update_zenodo:
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
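# The audio command above (and the media command that follows) re-download a file
# whenever md5(target) != row['ID'], i.e. the media item's ID appears to double as
# the MD5 checksum of its content -- an inference from the code, not something
# documented here. A self-contained sketch of that freshness check, with
# hypothetical names:
import hashlib
import pathlib


def needs_download(target, expected_md5):
    # True if the file is missing or its content hash does not match the expected ID.
    target = pathlib.Path(target)
    if not target.exists():
        return True
    return hashlib.md5(target.read_bytes()).hexdigest() != expected_md5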
def run(args):
    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error('You cannot create the release and update zenodo at the same time.')
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error('The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(tqdm.tqdm(
                    [r for r in ds_cldf['media.csv']],
                    desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)
        media.append(media_dir / INDEX_CSV)

        try:
            zipf = zipfile.ZipFile(
                str(release_dir / '{0}.zip'.format(MEDIA)), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(
                release_dir / ZENODO_FILE_NAME, indent=4,
                default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [
                    {"identifier": community_id} for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                    "<a href='https://doi.org/{0}'>{1}</a> ".format(
                        args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(DESCRIPTION.format(
                url=online_url,
                formats=' ({0})'.format(formats) if formats else '',
                title=md['title'],
                supplement_to=supplement_to,
                descr=descr,
                online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write('README.md', README.format(
                title=md['title'],
                doi='https://doi.org/{0}'.format(args.parent_doi),
                ds_title=ds.metadata.title,
                license=license_md,
                formats=' ({0})'.format(formats) if formats else '',
                media=MEDIA,
                index=INDEX_CSV))

    if args.update_zenodo:
        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warn(
                'Passed deposit ID does not refer to latest version {0}!'.format(latest_version))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Version: ' + rec.metadata.version)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
def index(self):
    with update(self.path('datasets.json'), default=[], indent=4) as data:
        return [Dataset(md) for md in data]
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk and t.child_pk = lma.languoid_pk and lma.macroarea_pk = ma.pk
    and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" % (
                    _number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1] for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk and t.child_pk = lma.languoid_pk and lma.macroarea_pk = ma.pk
    and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" % (
                    _number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1] for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(source_pk=spk, valueset_pk=vspk))