Exemplo n.º 1
0
def upload_sources(args):
    """
    Upload not-yet-catalogued source PDFs and rebuild the sources README.

    Every ``*.pdf`` file in the ``sources`` data directory whose stem has no
    entry in the local ``sources/cdstar.json`` catalog is passed to
    ``Catalog.create`` (collection ``concepticon``) and the resulting object
    is recorded in the local catalog. Afterwards a table of contents with one
    line per catalogued source is handed to ``readme``.

    CDSTAR access is configured through the environment variables
    ``CDSTAR_URL``, ``CDSTAR_USER`` and ``CDSTAR_PWD``; a missing variable
    raises ``KeyError``. The catalog path is taken from ``args.args[0]``.

    concepticon upload_sources path/to/cdstar/catalog
    """
    toc = ['# Sources\n']
    api = Concepticon(args.data)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(args.args[0],
                     cdstar_url=os.environ['CDSTAR_URL'],
                     cdstar_user=os.environ['CDSTAR_USER'],
                     cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(api.data_path('sources').glob('*.pdf'),
                                key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                if not lcat.get(clid):
                    # Only the object of the first tuple produced by
                    # ``create`` is registered in the local catalog.
                    _, _, obj = list(
                        cat.create(fname, {'collection': 'concepticon'}))[0]
                    lcat.add(clid, obj)

        # The remote catalog context has exited here; the local catalog is
        # still open and is re-read so that new entries appear in the listing.
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))

    readme(api.data_path('sources'), toc)
Exemplo n.º 2
0
def rename(args):  # pragma: no cover
    """
    Rename the concept list ``args.args[0]`` to ``args.args[1]``.

    The steps run in this order:

    1. The concept list file is rewritten under the new name; in every data
       row the old ID is replaced by the new one in the ``ID`` column and a
       ``Concept`` retirement is recorded for the old concept ID.
    2. The metadata file is written under the new name with the URL of its
       first table pointing at the new metadata file name.
    3. The old concept list and metadata files are deleted.
    4. ``conceptlists.tsv`` is rewritten with the old ID replaced by the new
       one in every non-empty cell.
    5. A ``Conceptlist`` retirement is recorded and a hint for a manual
       verification command is printed.

    The new ID must match ``CONCEPTLIST_ID_PATTERN`` and every concept ID must
    start with the old ID; both are enforced with ``assert``.
    """
    api = Concepticon(args.repos)

    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    folder = cl.path.parent
    old_name = cl.path.name
    new_name = old_name.replace(from_, to_)
    old_md_path = folder / (old_name + MD_SUFFIX)
    new_md_name = new_name + MD_SUFFIX

    # Copy the concept list to its new location, rewriting concept IDs.
    with UnicodeWriter(folder / new_name, delimiter='\t') as writer:
        col_index = {}
        for lineno, row in enumerate(reader(cl.path, delimiter='\t')):
            if lineno == 0:
                # Header row: copy as is and remember column positions.
                writer.writerow(row)
                col_index = {name: pos for pos, name in enumerate(row)}
                continue

            id_pos = col_index['ID']
            oid = row[id_pos]
            assert oid.startswith(from_)
            nid = oid.replace(from_, to_)
            api.add_retirement(
                'Concept',
                {'id': oid, 'comment': 'renaming', 'replacement': nid})
            row[id_pos] = nid
            writer.writerow(row)

    # Store the metadata under the new name, pointing at the new file name.
    md = jsonlib.load(old_md_path, object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = new_md_name
    jsonlib.dump(md, folder / new_md_name, indent=4)

    # Drop the files belonging to the old name.
    cl.path.unlink()
    old_md_path.unlink()

    # Rewrite the concept list index; it is read completely before the same
    # file is opened for writing.
    index_path = api.data_path('conceptlists.tsv')
    rows = [
        [col.replace(from_, to_) if col else col for col in row]
        for row in reader(index_path, delimiter='\t')
    ]
    with UnicodeWriter(index_path, delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement(
        'Conceptlist', {'id': from_, 'comment': 'renaming', 'replacement': to_})

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json

to confirm the renaming was complete!""".format(from_))
Exemplo n.º 3
0
def upload_sources(args):
    """
    Upload new source PDFs to the CDSTAR catalog and refresh the sources README.

    Notes
    -----
    The catalog path is the first positional argument; when no argument is
    given it is read from the environment variable CDSTAR_CATALOG.

    CDSTAR authorisation information is read from the environment variables:
        - CDSTAR_URL
        - CDSTAR_USER
        - CDSTAR_PWD

    Only PDFs whose stem is not yet present in the local ``cdstar.json``
    catalog are uploaded. The catalog path in use is printed at the end.

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    if args.args:
        catalog_path = args.args[0]
    else:
        catalog_path = os.environ["CDSTAR_CATALOG"]

    toc = ["# Sources\n"]
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path("sources", "cdstar.json")) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ["CDSTAR_URL"],
                cdstar_user=os.environ["CDSTAR_USER"],
                cdstar_pwd=os.environ["CDSTAR_PWD"],
        ) as cat:
            pdfs = sorted(
                api.data_path("sources").glob("*.pdf"), key=lambda f: f.stem)
            for pdf in pdfs:
                clid = as_unicode(pdf.stem)
                if lcat.get(clid):
                    # Already catalogued locally, nothing to upload.
                    continue
                created = list(cat.create(pdf, {"collection": "concepticon"}))
                _, _, obj = created[0]
                lcat.add(clid, obj)

        # One README line per catalogued source, ordered by key.
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            entry = "- [{0} [PDF {1}]]({2})".format(
                key, format_size(spec["size"]), spec["url"])
            toc.append(entry)

    readme(api.data_path("sources"), toc)
    print(catalog_path)
Exemplo n.º 4
0
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    The path of the CDSTAR catalog is taken from the first positional
    argument; if no argument is given, it is read from the environment
    variable CDSTAR_CATALOG (a missing variable raises ``KeyError``).

    CDSTAR authorisation information should be supplied in the form of
    environment variables:
        - CDSTAR_URL
        - CDSTAR_USER
        - CDSTAR_PWD

    Only PDF files whose stem has no entry in the local ``cdstar.json``
    catalog are uploaded. The catalog path in use is printed at the end.

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    catalog_path = args.args[0] if args.args else os.environ['CDSTAR_CATALOG']
    toc = ['# Sources\n']
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(
                    api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                if not lcat.get(clid):
                    # Only the object of the first tuple produced by
                    # ``create`` is registered in the local catalog.
                    _, _, obj = list(cat.create(fname, {'collection': 'concepticon'}))[0]
                    lcat.add(clid, obj)

        # The remote catalog context has exited here; the local catalog is
        # still open and is re-read so that new entries appear in the listing.
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))

    readme(api.data_path('sources'), toc)
    print(catalog_path)
Exemplo n.º 5
0
def link(args):
    """
    Complete the linking of concepts to concept sets for one concept list.

    If either CONCEPTICON_GLOSS or CONCEPTICON_ID is given, the other is added.

    The argument is first tried as a file path as given; if no such file
    exists it is looked up in the ``conceptlists`` data directory. A
    ``ParserError`` is raised when neither location holds a regular file.

    concepticon link <concept-list>
    """
    api = Concepticon(args.data)
    name = args.args[0]

    conceptlist = Path(name)
    if not (conceptlist.exists() and conceptlist.is_file()):
        # Fall back to the repository's own concept list directory.
        conceptlist = api.data_path('conceptlists', name)
        if not (conceptlist.exists() and conceptlist.is_file()):
            raise ParserError('no file %s found' % name)

    linker = Linker(conceptlist.stem, api.conceptsets.values())
    rewrite(conceptlist, linker)
Exemplo n.º 6
0
def link(args):
    """
    Link the concepts of a given concept list to concept sets.

    Notes
    -----
    If either CONCEPTICON_GLOSS or CONCEPTICON_ID is given, the other is added.

    The argument is first tried as a file path as given; if no such file
    exists it is looked up in the ``conceptlists`` data directory. A
    ``ParserError`` is raised when neither location holds a regular file.

    Examples
    --------
    $ concepticon link path_to_conceptlist.tsv
    """
    api = Concepticon(args.repos)
    name = args.args[0]

    conceptlist = Path(name)
    if not (conceptlist.exists() and conceptlist.is_file()):
        # Fall back to the repository's own concept list directory.
        conceptlist = api.data_path('conceptlists', name)
        if not (conceptlist.exists() and conceptlist.is_file()):
            raise ParserError('no file %s found' % name)

    linker = Linker(conceptlist.stem, api.conceptsets.values())
    rewrite(conceptlist, linker)
Exemplo n.º 7
0
def check(api=None):
    """
    Run integrity checks on the Concepticon data and report problems.

    Problems are reported through ``error`` and ``warning``; nothing is raised
    by the checks themselves (apart from the ``assert`` on deprecated IDs).

    Parameters
    ----------
    api : optional
        API object to check. If falsy, one is created from ``REPOS_PATH``;
        when that path does not exist the function returns ``None`` without
        checking anything.

    Returns
    -------
    The module-level ``SUCCESS`` value.
    NOTE(review): ``SUCCESS`` is not assigned in this function; presumably
    ``error`` updates it - confirm against its definition.
    """
    if not api:
        if not REPOS_PATH.exists():
            return  # pragma: no cover
        api = Concepticon(REPOS_PATH)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    for meta in api.metadata.values():
        # Column names declared in the JSON table schema vs. the keys of the
        # first row of values.
        cnames_schema = set(var['name'] for var in meta.meta['tableSchema']['columns'])
        cnames_tsv = set(list(meta.values.values())[0])
        if cnames_tsv - cnames_schema:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        # Every row must have exactly the set of columns named in the schema.
        # NOTE(review): i + 2 presumably turns the 0-based index into a
        # 1-based line number after a header line - confirm.
        for i, value in enumerate(meta.values.values()):
            if set(value.keys()) != cnames_schema:  # pragma: no cover
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(meta.id, i + 2), 'name')
        for ref in split(meta.meta.get('dc:references') or ''):
            all_refs.add(ref)

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for i, cl in enumerate(api.conceptlists.values()):
        for ref in cl.refs:
            if ref not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(ref), 'conceptlists.tsv', i + 2)
            all_refs.add(ref)
        # Keys matched by BIB_PATTERN in the note text also count as used.
        refs_in_text = re.findall(BIB_PATTERN, cl.note)
        for ref in refs_in_text:
            all_refs.add(ref)

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')
    # 'List2016a' is always treated as referenced, so it is never reported as
    # unused by the loop below.
    all_refs.add('List2016a')

    # Every bibliography record must be referenced somewhere.
    for ref in api.bibliography:
        if ref not in all_refs:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for references to concept sets, by ID and by gloss.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': set(cs.gloss for cs in api.conceptsets.values()),
    }

    # Both ends of every relation must name an existing concept set.
    for i, rel in enumerate(api.relations.raw):
        for attr, type_ in [
            ('SOURCE', 'concepticon_id'),
            ('TARGET', 'concepticon_id'),
            ('SOURCE_GLOSS', 'concepticon_gloss'),
            ('TARGET_GLOSS', 'concepticon_gloss'),
        ]:
            if rel[attr] not in ref_cols[type_]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(attr, rel[attr]), 'conceptrelations', i + 2)

    # Every TSV file in the conceptlists directory must be a known concept list.
    for fname in api.data_path('conceptlists').glob('*.tsv'):
        if fname.stem not in api.conceptlists:  # pragma: no cover
            error(
                'conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    for cl in api.conceptlists.values():
        for i, concept in enumerate(cl.concepts.values()):
            # The presence of the source language columns is checked once per
            # list, on its first concept.
            if i == 0:  # pragma: no cover
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            # A translation counts as present if it is found in the concept's
            # attributes or as an attribute of the concept object; for
            # 'english' a concept with an empty gloss is also accepted.
            for lg in cl.source_language:  # pragma: no cover
                if not (concept.attributes.get(lg.lower()) or
                        getattr(concept, lg.lower(), None) or
                        (lg.lower() == 'english' and not concept.gloss)):
                    error('missing source language translation %s' % lg, cl.id, i + 2)
            # Non-empty concept set links must refer to existing concept sets.
            for attr, values in ref_cols.items():
                val = getattr(concept, attr)
                if val and val not in values:  # pragma: no cover
                    error('invalid value for %s: %s' % (attr, val), cl.id, i + 2)

    # Group concept sets connected by 'sameas' relations and check that
    # glosses are unique.
    sameas = {}
    glosses = set()
    for cs in api.conceptsets.values():
        if cs.gloss in glosses:  # pragma: no cover
            error('duplicate conceptset gloss: {0}'.format(cs.gloss), cs.id)
        glosses.add(cs.gloss)
        for target, rel in cs.relations.items():
            if rel == 'sameas':
                # Join the group that already contains the target; if there is
                # none (loop ends without break), start a new group keyed by
                # this concept set's gloss.
                for group in sameas.values():
                    if target in group:  # pragma: no cover
                        group.add(cs.id)
                        break
                else:
                    sameas[cs.gloss] = {cs.id, target}

    # Within each group the numerically smallest ID is kept; every other ID is
    # mapped to it as deprecated.
    deprecated = {}
    for s in sameas.values():
        csids = sorted(s, key=lambda j: int(j))
        for csid in csids[1:]:
            assert csid not in deprecated
            deprecated[csid] = csids[0]

    # No concept may be linked to a deprecated concept set.
    for cl in api.conceptlists.values():
        for concept in cl.concepts.values():
            if concept.concepticon_id in deprecated:  # pragma: no cover
                error('deprecated concept set {0} linked for {1}'.format(
                    concept.concepticon_id, concept.id), cl.id)

    return SUCCESS
Exemplo n.º 8
0
def test():
    """
    Run integrity checks on the data found at ``REPOS_PATH``.

    Returns ``None`` immediately when ``REPOS_PATH`` does not exist. Problems
    are reported through ``error`` and ``warning``; at the end a
    ``ValueError`` is raised if the module-level ``SUCCESS`` value is falsy.
    """
    if not REPOS_PATH.exists():
        return  # pragma: no cover

    api = Concepticon(REPOS_PATH)

    # Cite keys that are referenced anywhere in the data.
    all_refs = set()
    for meta in api.metadata.values():
        cnames_schema = {
            column['name'] for column in meta.meta['tableSchema']['columns']}
        first_row = list(meta.values.values())[0]
        if set(first_row) - cnames_schema:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        for lineno, value in enumerate(meta.values.values(), start=2):
            if set(value.keys()) != cnames_schema:  # pragma: no cover
                message = (
                    'meta data {0} contains irregular number of columns in line {1}'
                    .format(meta.id, lineno))
                error(message, 'name')
        all_refs.update(split(meta.meta.get('dc:references') or ''))

    # Concept lists may only reference records of the BibTeX file
    # references.bib.
    for lineno, cl in enumerate(api.conceptlists.values(), start=2):
        for ref in cl.refs:
            if ref not in api.bibliography:  # pragma: no cover
                error(
                    'invalid bibtex record: {0}'.format(ref),
                    'conceptlists.tsv',
                    lineno)
            all_refs.add(ref)
        all_refs.update(re.findall(BIB_PATTERN, cl.note))

        # Sources should come with a PDF; a missing one only yields a warning.
        for ref in cl.pdf:
            if ref not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')

    for ref in api.bibliography:
        if ref not in all_refs:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for references to concept sets, by ID and by gloss.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': {cs.gloss for cs in api.conceptsets.values()},
    }

    relation_columns = (
        ('SOURCE', 'concepticon_id'),
        ('TARGET', 'concepticon_id'),
        ('SOURCE_GLOSS', 'concepticon_gloss'),
        ('TARGET_GLOSS', 'concepticon_gloss'),
    )
    for lineno, rel in enumerate(api.relations.raw, start=2):
        for attr, type_ in relation_columns:
            if rel[attr] not in ref_cols[type_]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(attr, rel[attr]),
                    'conceptrelations',
                    lineno)

    for fname in api.data_path('conceptlists').glob('*.tsv'):
        if fname.stem not in api.conceptlists:  # pragma: no cover
            error(
                'conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    for cl in api.conceptlists.values():
        for i, concept in enumerate(cl.concepts.values()):
            lineno = i + 2
            if i == 0:  # pragma: no cover
                # Column presence is checked once per list, on its first concept.
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            for lg in cl.source_language:  # pragma: no cover
                if concept.attributes.get(lg.lower()):
                    continue
                if getattr(concept, lg.lower(), None):
                    continue
                error('missing source language translation %s' % lg, cl.id, lineno)

            for attr, values in ref_cols.items():
                val = getattr(concept, attr)
                if val and val not in values:  # pragma: no cover
                    error('invalid value for %s: %s' % (attr, val), cl.id, lineno)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')