Example #1
def test_update(tmppath):
    p = tmppath / 'test'
    with pytest.raises(ValueError):
        with update(p):
            pass  # pragma: no cover

    with update(p, default={}) as obj:
        obj['a'] = 1

    with update(p) as obj:
        assert obj['a'] == 1
        obj['a'] = 2

    with update(p) as obj:
        assert obj['a'] == 2
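
The test above pins down update()'s contract: opening a missing file without a default raises ValueError, a default seeds the object, and whatever the with block leaves in the object is written back to the file on exit. A minimal sketch of such a context manager (plain json and pathlib standing in; the actual clldutils implementation may differ in details):

import contextlib
import json
import pathlib

@contextlib.contextmanager
def update(path, default=None, **dump_kw):
    path = pathlib.Path(path)
    if path.exists():
        obj = json.loads(path.read_text(encoding='utf8'))
    elif default is None:
        raise ValueError('path does not exist: {0}'.format(path))
    else:
        obj = default
    yield obj  # the caller mutates obj in place ...
    # ... and the result is written back on exit:
    path.write_text(json.dumps(obj, **dump_kw), encoding='utf8')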
Example #2
def run(args):
    dataset = get_dataset(args)
    with update(dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
        modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
        contribs = dataset.dir / 'CONTRIBUTORS.md'
        creators, contributors = get_creators_and_contributors(
            contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
        if creators:
            md['creators'] = [contrib(p) for p in creators]
        if contributors:
            md["contributors"] = [contrib(p) for p in contributors]
        communities = [r["identifier"] for r in md.get("communities", [])] + \
                      [c.strip() for c in nfilter(args.communities.split(','))]
        if communities:
            md['communities'] = [
                {"identifier": community_id} for community_id in sorted(set(communities))]
        md.update(
            {
                "title": dataset.metadata.title,
                "access_right": "open",
                "keywords": sorted(set(md.get("keywords", []) + ["linguistics"] + modules)),
                "upload_type": "dataset",
            }
        )
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}
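
nfilter (presumably clldutils' helper of that name) drops falsy items, so an empty --communities string contributes nothing rather than ['']. A one-line stand-in if you want to read the example without the dependency:

def nfilter(seq):
    # Keep only truthy items: nfilter('a,,b'.split(',')) == ['a', 'b'].
    return [e for e in seq if e]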
Example #3
def to_replacements(self, filename):
    """Write a JSON file with 301s from merged glottolog_ref_ids."""
    select_pairs = sa.select([Entry.refid.label('id'), Entry.id.label('replacement')])\
        .where(Entry.id != Entry.refid)\
        .order_by(Entry.id)
    with self.execute(select_pairs) as cursor:
        pairs = list(map(dict, cursor))
    with jsonlib.update(filename, default=[], indent=4) as repls:
        repls.extend(pairs)
Example #4
def to_replacements(self, filename):
    """Write a JSON file with 301s from merged glottolog_ref_ids."""
    with self.connect() as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.execute('SELECT refid AS id, id AS replacement '
                              'FROM entry WHERE id != refid ORDER BY id')
        # Materialize eagerly: a lazy map(dict, cursor) would only be consumed
        # after the connection context has been exited.
        pairs = [dict(row) for row in cursor]
    with jsonlib.update(filename, default=[], indent=4) as repls:
        repls.extend(pairs)
Example #5
    def to_replacements(self, filename, *, indent: typing.Optional[int] = 4):
        """Write a JSON file with 301s from merged ``glottolog_ref_id``s."""
        select_pairs = (sa.select(
            Entry.refid.label('id'), Entry.id.label('replacement')).where(
                Entry.id != Entry.refid).order_by('replacement'))

        with self.execute(select_pairs) as result:
            pairs = result.mappings().all()

        with jsonlib.update(filename, default=[], indent=indent) as repls:
            # RowMapping is not JSON serializable
            repls.extend(map(dict, pairs))
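
Examples #3-#5 are three generations of the same method: a SQLAlchemy 1.x select([...]), plain sqlite3, and the SQLAlchemy 2.0-style select() with Result.mappings(). All three write the same JSON array of {"id": ..., "replacement": ...} objects. A minimal sanity check of the output (file name hypothetical, plain json standing in for jsonlib):

import json

with open('replacements.json', encoding='utf8') as fp:
    repls = json.load(fp)
assert all(set(r) == {'id', 'replacement'} for r in repls)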
Example #6
def tdwg(args):
    """
    Assign societies to TDWG regions
    """
    try:
        import fiona
        from shapely.geometry import Point
    except ImportError:
        args.log.error('fiona and shapely must be installed for this command')
        return

    # Backport of math.isclose() (in the stdlib since Python 3.5).
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    with fiona.collection(
            args.repos.path("geo", "level2-shape/level2.shp").as_posix(),
            "r") as source:
        regions = [f for f in source]

    with update(args.repos.path("geo", "societies_tdwg.json"),
                default={},
                indent=4) as soc_tdwg:
        for ds in args.repos.datasets:
            for soc in ds.societies:
                spec = soc_tdwg.get(
                    soc.id,
                    dict(lat=soc.Lat, lon=soc.Long, name=None, code=None))
                if isclose(spec['lat'], soc.Lat) \
                        and isclose(spec['lon'], soc.Long) \
                        and spec['code']:
                    continue

                region, dist = geo.match(Point(spec['lon'], spec['lat']),
                                         regions)
                spec['name'] = region['properties']['REGION_NAM']
                spec['code'] = region['properties']['TDWG_CODE']

                if dist == 0:
                    args.log.info('{0} contained in region {1}'.format(
                        soc, spec['name']))
                else:
                    args.log.warn(
                        'assigning {0} to nearest region {1}, distance {2}'.format(
                            soc, region['properties']['REGION_NAM'], dist))

                soc_tdwg[soc.id] = spec
Example #7
def dl2cdstar(args):
    app = app_name(args.project)
    if not app:
        args.log.error('cannot parse package name')
        return

    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return

    title_pattern = re.compile('%s (?P<version>[0-9.]+) - downloads' % re.escape(app))
    title = '{0} {1} - downloads'.format(app, args.version)
    pkg_dir = args.project.joinpath(app)
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        obj = cat.api.get_object()
        obj.metadata = {"creator": "pycdstar", "title": title}
        if args.args:
            obj.metadata["description"] = args.args[0]
        for fname in pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                print(fname.name)
                obj.add_bitstream(
                    fname=fname.as_posix(), name=fname.name.replace('-', '_'))
        cat.add(obj)

    fname = pkg_dir.joinpath('static', 'downloads.json')
    with update(fname, default={}, indent=4) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if match.group('version') not in downloads:
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(fname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
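
This command, like the cdstar commands in Examples #8 and #11 below, reads its CDSTAR connection settings straight from the environment and fails with a KeyError if any of them is missing. A setup sketch (variable names taken from the code above, values purely illustrative):

import os

os.environ.setdefault('CDSTAR_CATALOG', '/path/to/catalog.json')
os.environ.setdefault('CDSTAR_URL', 'https://cdstar.example.org')
os.environ.setdefault('CDSTAR_USER', 'user')
os.environ.setdefault('CDSTAR_PWD', 'secret')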
Example #8
def cdstar(args):
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return

    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with Catalog(Path(os.environ['CDSTAR_CATALOG']),
                 cdstar_url=os.environ['CDSTAR_URL'],
                 cdstar_user=os.environ['CDSTAR_USER'],
                 cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        obj = cat.api.get_object()
        obj.metadata = {
            "creator": "pycdstar",
            "title": "glottolog %s - downloads" % args.args[0],
            "description": "Custom downloads for release %s of "
                           "[Glottolog](http://glottolog.org)" % args.args[0],
        }
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                print(fname.name)
                obj.add_bitstream(fname=fname.as_posix(),
                                  name=fname.name.replace('-', '_'))
        cat.add(obj)

    fname = args.pkg_dir.joinpath('static', 'downloads.json')
    with update(fname, default={}, indent=4) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if match.group('version') not in downloads:
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(fname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
Example #9
def run(args):
    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.version
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with args.catalog_class(args.catalog, args.url, args.user,
                            args.pwd) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            print('adding bitstreams to {0}'.format(downloads[release]['oid']))
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
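                # Look for an existing bitstream with the same name; the for/else
                # resets bitstream to None when nothing matches.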
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        obj.read()
        cat.add(obj, update=True)

    with update(dlfname,
                default=collections.OrderedDict(),
                indent=4,
                sort_keys=True) as downloads:
        for oid, spec in load(args.catalog).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    version = match.group('version')
                    if version not in downloads or version == release:
                        args.log.info('update info for release {0}'.format(version))
                        spec['oid'] = oid
                        downloads[version] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(args.catalog))
Example #10
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select
  l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from
  languagesource as ls,
  language as l,
  source as s,
  ref as r
where
  ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by
  l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        meds[lpk].append((spk, sid, sname, med_type, year, pages))  # The last one is the overall MED

    # Now weed out the "newer but worse" sources:
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # Keep only sources strictly older than the last kept one; "newer but worse" items drop out.
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))):
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname) for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue

        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name
from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select
  l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk)
from
  language as l,
  treeclosuretable as t,
  parameter as p,
  valueset as vs,
  value as v
where
  l.pk = t.parent_pk and
  t.child_pk = vs.language_pk and
  vs.parameter_pk = p.pk and
  p.id = 'macroarea' and
  v.valueset_pk = vs.pk and
  l.pk not in (
    select language_pk 
    from valueset as _vs, parameter as _p 
    where _vs.parameter_pk = _p.pk and _p.id = 'macroarea'
  )
group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'macroarea')\
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join([macroareas[v.domainelement_pk][1] for v in vs.values])

    # Sanity check: by this point no source should still have a negative pages_int.
    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        raise ValueError(row)

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
Example #11
def cdstar(args):
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return

    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.args[0]
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        cat.add(obj, update=True)

    with update(dlfname, default=collections.OrderedDict(), indent=4, sort_keys=True) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads) or match.group('version') == release:
                        args.log.info('update info for release {0}'.format(match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
Example #12
def run(args):
    ds = Dataset().cldf_reader()

    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(
                            args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(
            audio.resolve())

        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            zipf = zipfile.ZipFile(str(release_dir / 'audio.zip'), 'w',
                                   zipfile.ZIP_DEFLATED)
            fp = args.out
            for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                for f in files:
                    if not f.startswith('.') and not f.startswith('__')\
                            and ((args.mimetype is None) or f.endswith(args.mimetype)):
                        zipf.write(os.path.join(root, f),
                                   os.path.relpath(os.path.join(root, f), fp))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            return {k: v for k, v in d.items()
                    if k in {'name', 'affiliation', 'orcid', 'type'}}

        with jsonlib.update(release_dir / zenodo_file_name,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [
                    {'id': community_id} for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, Dataset().id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': Dataset().metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    Dataset().id,
                    Dataset().metadata.url if Dataset().metadata.url else '',
                    VERSION))

            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(md['title'], GITHUB_PREFIX,
                                    Dataset().id,
                                    Dataset().metadata.title, license_md))

    if args.update_zenodo:
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info('  DOI:   ' + rec.metadata.doi)
        args.log.info('  Title: ' + rec.metadata.title)
        args.log.info('  Date:  ' + rec.metadata.publication_date)
        args.log.info('  Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
Example #13
def run(args):

    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.'
            )
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(
                (media_dir / INDEX_CSV) if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm([r for r in ds_cldf['media.csv']],
                              desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
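                        # Re-download unless the file already exists and its md5
                        # matches the row ID.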
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)

        media.append(media_dir / INDEX_CSV)

        try:
            zipf = zipfile.ZipFile(str(release_dir / '{0}.zip'.format(MEDIA)),
                                   'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            return {k: v for k, v in d.items()
                    if k in {'name', 'affiliation', 'orcid', 'type'}}

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(release_dir / ZENODO_FILE_NAME,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [
                    {"identifier": community_id}
                    for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                                "<a href='https://doi.org/{0}'>{1}</a> ".format(
                    args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })

            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

    if args.update_zenodo:

        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warn(
                'Passed deposit ID does not refer to latest version {0}!'.format(
                    latest_version))
        args.log.info('  DOI:     ' + rec.metadata.doi)
        args.log.info('  Title:   ' + rec.metadata.title)
        args.log.info('  Version: ' + rec.metadata.version)
        args.log.info('  Date:    ' + rec.metadata.publication_date)
        args.log.info('  Files:   ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
Example #14
def index(self):
    with update(self.path('datasets.json'), default=[], indent=4) as data:
        return [Dataset(md) for md in data]
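
Note that the return statement sits inside the with block, so index() both reads datasets.json (seeding it with an empty list via default=[] if the file is missing) and rewrites it on exit. A more explicit spelling of the same behavior:

def index(self):
    with update(self.path('datasets.json'), default=[], indent=4) as data:
        datasets = [Dataset(md) for md in data]
    return datasets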
Example #15
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l,
  treeclosuretable as t,
  languoidmacroarea as lma,
  macroarea as ma
where
  l.pk = t.parent_pk and
  t.child_pk = lma.languoid_pk and
  lma.macroarea_pk = ma.pk and
  l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
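        # Normalize a whitespace-separated string of '**key**: pages' citations
        # to a set of bare '**key**' markers.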
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
Example #16
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l,
  treeclosuretable as t,
  languoidmacroarea as lma,
  macroarea as ma
where
  l.pk = t.parent_pk and
  t.child_pk = lma.languoid_pk and
  lma.macroarea_pk = ma.pk and
  l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(source_pk=spk, valueset_pk=vspk))