Exemplo n.º 1
0
def run(args):
    """
    Print information about the datasets and files in `args.repos.index`.

    Depending on the flags set on `args`, this prints

    - the citation of each dataset (`args.citations`),
    - one table row per dataset plus a totals row (`args.datasets_only`), or
    - one table row per file, restricted to datasets whose name contains
      `args.dataset` if that is set, and extended by sample and reference counts
      if `args.samples` / `args.references` are set.

    With `args.index` set, the table format is forced to 'pipe' and a Markdown
    header is printed before the per-file table.
    """
    if args.citations:  # pragma: no cover
        for dataset in args.repos.index:
            print('> {}\n'.format(dataset.citation))
        return

    if args.datasets_only:
        with Table(args, 'dataset', 'files', 'size') as t:
            totalfiles, totalsize = 0, 0
            for dataset in args.repos.index:
                # Compute the per-dataset numbers once; they feed both the row
                # and the running totals.
                nfiles = len(dataset.files)
                size = sum(f.size for f in dataset.files)
                totalfiles += nfiles
                totalsize += size
                t.append([dataset.name, nfiles, format_size(size)])
            t.append([
                'total: {} datasets'.format(len(args.repos.index)),
                totalfiles,
                format_size(totalsize)])
        return

    if args.index:  # pragma: no cover
        args.format = 'pipe'
        print("""# Content

[georoc.sqlite.gz](georoc.sqlite.gz) contains data from
[GEOROC's precompiled datasets](https://data.goettingen-research-online.de/dataverse/digis)
as listed below.
""")

    with Table(args, 'file', 'dataset', 'size', 'last modified') as t:
        # Optional columns go between the fixed columns and the trailing 'path'.
        if args.samples:
            t.columns.append('# samples')
        if args.references:
            t.columns.append('# references')
        t.columns.append('path')
        for ds in args.repos.index:
            # An unset/empty dataset filter selects every dataset.
            if args.dataset and args.dataset not in ds.name:
                continue
            for f in ds.files:
                row = [
                    '[{}]({})'.format(f.id, f.md['pidURL']),
                    ds.name,
                    format_size(f.size),
                    f.date,
                ]
                # Count lazily instead of materializing the items in a list.
                if args.samples:
                    row.append(sum(1 for _ in f.iter_samples(args.repos, stdout=None)))
                if args.references:
                    row.append(sum(1 for _ in f.iter_references(args.repos)))
                row.append(f.name)
                t.append(row)
Exemplo n.º 2
0
def upload_sources(args):
    """
    Upload source PDFs missing from the local catalog and rewrite the sources README.

    Usage:

    concepticon upload_sources path/to/cdstar/catalog

    The CDSTAR connection settings are read from the environment variables
    CDSTAR_URL, CDSTAR_USER and CDSTAR_PWD.
    """
    api = Concepticon(args.data)
    toc = ['# Sources\n']
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                args.args[0],
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            pdfs = sorted(
                api.data_path('sources').glob('*.pdf'), key=lambda p: p.stem)
            for pdf in pdfs:
                clid = as_unicode(pdf.stem)
                # Only PDFs without an entry in the local catalog get uploaded.
                if lcat.get(clid):
                    continue
                created = list(cat.create(pdf, {'collection': 'concepticon'}))
                _, _, obj = created[0]
                lcat.add(clid, obj)

        # One README entry per catalog item, in sorted key order.
        for key in sorted(lcat.items):
            entry = lcat.get(key)
            line = '- [{0} [PDF {1}]]({2})'.format(
                key, format_size(entry['size']), entry['url'])
            toc.append(line)

    readme(api.data_path('sources'), toc)
Exemplo n.º 3
0
def linked_image(obj, check=True):
    """
    Return an HTML link to the bitstream of `obj`, wrapping its 'web' image.

    :param obj: Object passed on to `maintype` and `bitstream_url`; its \
    `jsondata` may provide a 'size' (0 is used otherwise).
    :param check: If true, raise `ValueError` unless `maintype(obj)` is 'image'.
    """
    if check:
        kind = maintype(obj)
        if kind != 'image':
            raise ValueError('type mismatch: {0} and image'.format(kind))

    thumbnail = HTML.img(src=bitstream_url(obj, 'web'), class_='image')
    target = bitstream_url(obj)
    size_label = format_size(obj.jsondata.get('size', 0))
    return HTML.a(
        thumbnail,
        href=target,
        title="View image ({0})".format(size_label))
Exemplo n.º 4
0
def linked_image(obj, check=True):
    """
    Build an anchor element showing the 'web' rendition of an image object.

    The anchor points to `bitstream_url(obj)` and carries the formatted size
    (taken from `obj.jsondata`, defaulting to 0) in its title.

    :raises ValueError: if `check` is true and `maintype(obj)` is not 'image'.
    """
    is_image = maintype(obj) == 'image' if check else True
    if not is_image:
        raise ValueError('type mismatch: {0} and image'.format(maintype(obj)))

    img = HTML.img(src=bitstream_url(obj, 'web'), class_='image')
    href = bitstream_url(obj)
    title = "View image ({0})".format(format_size(obj.jsondata.get('size', 0)))
    return HTML.a(img, href=href, title=title)
Exemplo n.º 5
0
def format_file(f, with_mime_type=True):
    """
    Return a badge `span` with an icon and a bracketed label for file `f`.

    The icon is chosen by the part of `f.mime_type` before the first '/'
    ('file' for anything but image and video). The label holds the formatted
    size from `f.jsondata['size']`, preceded by the mime type if
    `with_mime_type` is true.
    """
    icons_by_maintype = {'image': 'camera', 'video': 'facetime-video'}
    main_type = f.mime_type.split('/')[0]
    icon_name = icons_by_maintype.get(main_type, 'file')

    prefix = f.mime_type + '; ' if with_mime_type else ''
    text = ' [%s%s]' % (prefix, misc.format_size(f.jsondata['size']))
    return HTML.span(icon(icon_name, inverted=True), text, class_='badge')
Exemplo n.º 6
0
Arquivo: util.py Projeto: clld/ldh
def file_link(file):
    """
    Return a `span` with a download link for `file` and, if present, its license.

    The link target is `file.jsondata['url']`, falling back to a
    hdl.handle.net URL derived from `file.id` (with '__' replaced by '/').
    The link text shows the upper-cased suffix of the URL's last path segment
    ('PDF' if there is none) and the formatted file size.
    """
    handle_url = 'http://hdl.handle.net/' + file.id.replace('__', '/')
    url = file.jsondata.get('url', handle_url)

    suffix = pathlib.Path(url.split('/')[-1]).suffix
    if suffix:
        kind = suffix[1:].upper()  # drop the leading dot
    else:
        kind = 'PDF'

    download = HTML.a(
        icon('file'),
        '{} ({})'.format(kind, format_size(file.jsondata['size'])),
        href=url,
    )
    parts = [download]

    license_info = file.jsondata['license']
    if license_info:
        parts.append(' licensed under ')
        parts.append(external_link(
            license_info['url'],
            label=license_info['id'].upper(),
            title=license_info['name']))
    return HTML.span(*parts)
Exemplo n.º 7
0
def upload_sources(args):
    """
    Upload new source PDFs to CDSTAR and regenerate the sources README.

    Every PDF in the `sources` data directory without an entry in the local
    `cdstar.json` catalog is uploaded and registered there. Afterwards a table
    of contents listing all catalog entries is written via `readme`, and the
    catalog path is printed.

    Notes
    -----
    The path of the CDSTAR catalog is taken from the first positional argument
    or, if there is none, from the environment variable CDSTAR_CATALOG.

    CDSTAR authorisation information should be supplied in the form of
    environment variables:
        - CDSTAR_URL
        - CDSTAR_USER
        - CDSTAR_PWD

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    if args.args:
        catalog_path = args.args[0]
    else:
        catalog_path = os.environ["CDSTAR_CATALOG"]

    toc = ["# Sources\n"]
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path("sources", "cdstar.json")) as lcat:
        credentials = {
            "cdstar_url": os.environ["CDSTAR_URL"],
            "cdstar_user": os.environ["CDSTAR_USER"],
            "cdstar_pwd": os.environ["CDSTAR_PWD"],
        }
        with Catalog(catalog_path, **credentials) as cat:
            pdfs = api.data_path("sources").glob("*.pdf")
            for pdf in sorted(pdfs, key=lambda p: p.stem):
                clid = as_unicode(pdf.stem)
                if lcat.get(clid):
                    # Already registered in the local catalog.
                    continue
                uploaded = list(cat.create(pdf, {"collection": "concepticon"}))
                _, _, obj = uploaded[0]
                lcat.add(clid, obj)

        for key in sorted(lcat.items):
            spec = lcat.get(key)
            size = format_size(spec["size"])
            toc.append("- [{0} [PDF {1}]]({2})".format(key, size, spec["url"]))

    readme(api.data_path("sources"), toc)
    print(catalog_path)
Exemplo n.º 8
0
def link(obj, label=None, with_mime_type=True, badge=False):
    """
    Return an HTML link to the bitstream of `obj`, with icon and label.

    :param obj: Object passed on to `mimetype`, `maintype` and `bitstream_url`.
    :param label: Link text; defaults to 'View file'.
    :param with_mime_type: Whether to append the mime type to the label.
    :param badge: Whether to render the link as badge (with inverted icon).

    The formatted size (if `obj.jsondata` has a truthy 'size') and the mime
    type are appended to the label in parentheses, separated by ', '.
    """
    text = label or 'View file'
    mtype = mimetype(obj)

    # Prefer an icon for the full mime type, then one for the main type.
    fallback_icon = MIMETYPE_TO_ICON.get(
        maintype(obj, mimetype_=mtype), 'download-alt')
    icon_name = MIMETYPE_TO_ICON.get(mtype, fallback_icon)

    size = obj.jsondata.get('size')
    details = format_size(size) if size else ''
    if with_mime_type:
        if details:
            details += ', '
        details += mtype
    if details:
        text += ' (%s)' % details

    css_class = 'badge' if badge else 'cdstar_link'
    content = HTML.span(
        icon(icon_name, inverted=badge), ' ' + text, class_=css_class)
    return HTML.a(content, href=bitstream_url(obj))
Exemplo n.º 9
0
def link(obj, label=None, with_mime_type=True, badge=False):
    """
    Build an anchor pointing to `bitstream_url(obj)`.

    The anchor wraps a `span` (class 'badge' or 'cdstar_link', depending on
    `badge`) with an icon looked up in `MIMETYPE_TO_ICON` and a label. The
    label is `label` (default: 'View file'), followed in parentheses by the
    formatted size - if `obj.jsondata` provides a truthy 'size' - and, if
    `with_mime_type` is true, the mime type.
    """
    if not label:
        label = 'View file'

    mtype = mimetype(obj)
    by_maintype = MIMETYPE_TO_ICON.get(
        maintype(obj, mimetype_=mtype), 'download-alt')
    icon_ = MIMETYPE_TO_ICON.get(mtype, by_maintype)

    # Metadata shown in parentheses after the label.
    md = format_size(obj.jsondata['size']) if obj.jsondata.get('size') else ''
    if with_mime_type:
        md = (md + ', ' if md else md) + mtype
    if md:
        label += ' (%s)' % md

    return HTML.a(
        HTML.span(
            icon(icon_, inverted=badge),
            ' ' + label,
            class_='cdstar_link' if not badge else 'badge'),
        href=bitstream_url(obj))
Exemplo n.º 10
0
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    PDFs in the `sources` data directory which are not yet listed in the local
    `cdstar.json` catalog are uploaded and added to it; then the README of the
    sources directory is rewritten from the catalog and the catalog path is
    printed.

    Notes
    -----
    CDSTAR authorisation information should be supplied in the form of
    environment variables:
        - CDSTAR_URL
        - CDSTAR_USER
        - CDSTAR_PWD

    If no catalog path is passed as argument, it is read from the environment
    variable CDSTAR_CATALOG.

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    catalog_path = args.args[0] if args.args else os.environ['CDSTAR_CATALOG']
    toc = ['# Sources\n']
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            source_files = sorted(
                api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem)
            for source_file in source_files:
                clid = as_unicode(source_file.stem)
                known = lcat.get(clid)
                if known:
                    continue
                # Upload the PDF and register the created object locally.
                results = list(
                    cat.create(source_file, {'collection': 'concepticon'}))
                _, _, obj = results[0]
                lcat.add(clid, obj)

        toc.extend(
            '- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url'])
            for key, spec in ((k, lcat.get(k)) for k in sorted(lcat.items)))

    readme(api.data_path('sources'), toc)
    print(catalog_path)
Exemplo n.º 11
0
 def size_h(self):
     """Return the formatted `size` of the object, using 0 if there is no such attribute."""
     raw_size = getattr(self, 'size', 0)
     return format_size(raw_size)
Exemplo n.º 12
0
 def size(self, req):
     """
     Return the formatted size of the file at `self.abspath(req)`.

     :return: Result of `format_size` for the file's size in bytes, or `None` \
     if the path does not exist.
     """
     location = self.abspath(req)
     if not location.exists():
         return None
     return format_size(location.stat().st_size)
Exemplo n.º 13
0
 def size_h(self):
     """Return `self.size` as rendered by `format_size`."""
     formatted = format_size(self.size)
     return formatted
Exemplo n.º 14
0
 def size(self, req):
     """Return the formatted byte size of `self.abspath(req)`, or `None` for a missing path."""
     target = self.abspath(req)
     return format_size(target.stat().st_size) if target.exists() else None
Exemplo n.º 15
0
def run(args):
    """
    Download, list, package or publish the audio files of the dataset.

    Depending on the flags set on `args`:

    - `args.list`: print number and summed size of the files listed in
      `media.csv` per mimetype, instead of downloading them.
    - otherwise, unless `args.update_zenodo` is set: start a download for each
      file which does not exist locally or for which `md5` does not match the
      row ID, optionally restricted by `args.mimetype`.
    - `args.create_release`: zip the `audio` folder and write the Zenodo
      metadata file and a README into the release directory.
    - `args.update_zenodo`: update the Zenodo deposit with this ID from the
      stored metadata, after interactive confirmation, and publish it if needed.
    """
    ds = Dataset().cldf_reader()

    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'
    # Defined unconditionally: the release step needs this path also when the
    # download step did not run (e.g. in list mode).
    audio = args.out / 'audio'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        # Maps form IDs to parameter IDs, used as names of the download folders.
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm(list(ds['media.csv'])):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(
                            args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(
            audio.resolve())

        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            # The context manager closes the archive even if writing fails.
            with zipfile.ZipFile(
                    str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED) as zipf:
                fp = args.out
                for root, _dirs, files in tqdm.tqdm(os.walk(audio)):
                    for f in files:
                        # Skip hidden and '__'-prefixed files.
                        if not f.startswith('.') and not f.startswith('__')\
                                and ((args.mimetype is None) or f.endswith(args.mimetype)):
                            zipf.write(os.path.join(root, f),
                                       os.path.relpath(os.path.join(root, f), fp))
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            """Reduce a contributor dict to the keys stored in the metadata."""
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        with jsonlib.update(release_dir / zenodo_file_name,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [{
                    'id': community_id
                } for community_id in COMMUNITIES]
            md.update({
                'title':
                '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right':
                'open',
                'keywords':
                sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type':
                'video',
                'version':
                VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme':
                        'url',
                        'identifier':
                        '{0}{1}/tree/v{2}'.format(GITHUB_PREFIX,
                                                  Dataset().id, VERSION),
                        'relation':
                        'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme':
                    'url',
                    'identifier':
                    Dataset().metadata.url,
                    'relation':
                    'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    Dataset().id,
                    Dataset().metadata.url if Dataset().metadata.url else '',
                    VERSION))

            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(md['title'], GITHUB_PREFIX,
                                    Dataset().id,
                                    Dataset().metadata.title, license_md))

    if args.update_zenodo:
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info('  DOI:   ' + rec.metadata.doi)
        args.log.info('  Title: ' + rec.metadata.title)
        args.log.info('  Date:  ' + rec.metadata.publication_date)
        args.log.info('  Files: ' + ', '.join([f.key for f in rec.files]))
        # Anything but an explicit 'y' aborts the update.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
Exemplo n.º 16
0
def run(args):
    """
    Download, list, package or publish the media files of a dataset.

    After validating the combination of arguments (logging an error and raising
    `ParserError` for invalid ones), the function - depending on `args`:

    - `args.list`: prints number and summed size of the files listed in
      `media.csv` per mimetype and file extension, then returns.
    - otherwise, unless `args.update_zenodo` is set: writes an index CSV of the
      selected media files and starts a download for each file which does not
      exist locally or for which `md5` does not match the row ID. The selection
      can be restricted via the comma-separated `args.mimetype`, matched against
      file extension or start of the mimetype.
    - `args.create_release`: zips the media files plus index and writes the
      Zenodo metadata file and a README into the release directory.
    - `args.update_zenodo`: updates the Zenodo deposit with this ID from the
      stored metadata, after interactive confirmation, and publishes it if needed.
      With `args.debug` the sandbox API URL and access token are used.
    """

    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.'
            )
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(media_dir /
                           INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm(list(ds_cldf['media.csv']),
                              desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                # In debug mode only the first rows are processed.
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext)
                        # Files are distributed over folders named by the first
                        # two characters of their ID.
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        # Write the header along with the first row that is
                        # actually written - the very first row of media.csv may
                        # have been skipped by the mimetype filter.
                        if not media:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)

        media.append(media_dir / INDEX_CSV)

        try:
            # The context manager closes the archive even if writing fails.
            with zipfile.ZipFile(
                    str(release_dir / '{0}.zip'.format(MEDIA)),
                    'w', zipfile.ZIP_DEFLATED) as zipf:
                fp = args.out
                for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                    zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            """Reduce a contributor dict to the keys stored in the metadata."""
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(release_dir / ZENODO_FILE_NAME,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            # Merge communities already stored in the metadata with the ones
            # passed as argument and the configured defaults.
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [{
                    "identifier": community_id
                } for community_id in sorted(set(communities))]
            md.update({
                'title':
                '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right':
                'open',
                'keywords':
                sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type':
                'dataset',
                'publication_date':
                datetime.today().strftime('%Y-%m-%d'),
                'version':
                version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier':
                        '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            # args.parent_doi is guaranteed by the argument validation above.
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                                "<a href='https://doi.org/{0}'>{1}</a> ".format(
                    args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme':
                    'url',
                    'identifier':
                    ds.metadata.url,
                    'relation':
                    'isAlternateIdentifier'
                })

            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

    if args.update_zenodo:

        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.
                format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            # `warning` instead of the deprecated `warn` alias.
            args.log.warning(
                'Passed deposit ID does not refer to latest version {0}!'.
                format(latest_version))
        args.log.info('  DOI:     ' + rec.metadata.doi)
        args.log.info('  Title:   ' + rec.metadata.title)
        args.log.info('  Version: ' + rec.metadata.version)
        args.log.info('  Date:    ' + rec.metadata.publication_date)
        args.log.info('  Files:   ' + ', '.join([f.key for f in rec.files]))
        # Anything but an explicit 'y' aborts the update.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
Exemplo n.º 17
0
def test_format_size():
    """`format_size` returns a truthy result for each power of 1000 up to 1000 ** 9."""
    from clldutils.misc import format_size

    assert all(format_size(1000 ** exponent) for exponent in range(10))
Exemplo n.º 18
0
 def bitstream_link(oid, spec):
     """
     Return an HTML link to the bitstream described by `spec` of object `oid`.

     The link text consists of the bitstream ID followed by the formatted
     `spec['filesize']` in square brackets.
     """
     bitstream_id = spec['bitstreamid']
     path = '/bitstreams/{0}/{1}'.format(oid, bitstream_id)
     url = cdstar.SERVICE_URL.path(path).as_string()
     text = '{0} [{1}]'.format(bitstream_id, format_size(spec['filesize']))
     return HTML.a(text, href=url)
Exemplo n.º 19
0
 def bitstream_link(oid, spec):
     """Return an anchor labeled with bitstream ID and formatted file size, pointing to the bitstream."""
     location = '{0}/{1}'.format(oid, spec['bitstreamid'])
     href = SERVICE_URL.path(location).as_string()
     size = format_size(spec['filesize'])
     return HTML.a('{0} [{1}]'.format(spec['bitstreamid'], size), href=href)
Exemplo n.º 20
0
def test_format_size():
    """Check that `format_size` gives a truthy result for 1000**0 through 1000**9."""
    from clldutils.misc import format_size

    for size in (1000**power for power in range(10)):
        formatted = format_size(size)
        assert formatted
Exemplo n.º 21
0
def format_size(f):
    """Return the value stored under 'size' in `f.jsondata`, rendered by `misc.format_size`."""
    size = f.jsondata['size']
    return misc.format_size(size)