def upload_sources(args):
    """
    concepticon upload_sources path/to/cdstar/catalog

    For every ``*.pdf`` in the ``sources`` data directory that has no entry in
    ``sources/cdstar.json`` yet, create an object in the CDSTAR catalog given
    as first positional argument and register it in ``sources/cdstar.json``.
    Afterwards a listing of all registered sources is handed to ``readme``.

    The CDSTAR connection is configured through the environment variables
    CDSTAR_URL, CDSTAR_USER and CDSTAR_PWD; a missing variable raises
    ``KeyError``.
    """
    toc = ['# Sources\n']
    api = Concepticon(args.data)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                args.args[0],
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(
                    api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                if not lcat.get(clid):
                    # The result of ``create`` is consumed completely before the
                    # first item is picked, exactly as before.
                    _, _, obj = list(
                        cat.create(fname, {'collection': 'concepticon'}))[0]
                    # The return value of ``add`` was only ever stored in a local
                    # that got overwritten before being read, so it is dropped.
                    lcat.add(clid, obj)

        # One markdown bullet per registered source, ordered by key.
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))

    readme(api.data_path('sources'), toc)
def rename(args):  # pragma: no cover
    """
    Rename a concept list.

    ``args.args`` must unpack into exactly two values: the current concept
    list ID and the new one. The new ID has to match
    ``CONCEPTLIST_ID_PATTERN``. The concept list file and its metadata file
    are written under the new name, the old files are removed,
    ``conceptlists.tsv`` is rewritten with the old ID replaced, and a
    retirement is recorded for the list and for each of its concepts.
    """
    api = Concepticon(args.repos)

    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    old_tsv = cl.path
    folder = old_tsv.parent
    old_md = folder / (old_tsv.name + MD_SUFFIX)
    new_tsv_name = old_tsv.name.replace(from_, to_)

    # write the adapted concept list to the new path:
    with UnicodeWriter(folder / new_tsv_name, delimiter='\t') as writer:
        columns = []
        for lineno, row in enumerate(reader(old_tsv, delimiter='\t')):
            if lineno == 0:
                # The header row is copied unchanged.
                writer.writerow(row)
                # Map col name to row index
                columns = {name: idx for idx, name in enumerate(row)}
                continue

            old_id = row[columns['ID']]
            assert old_id.startswith(from_)
            new_id = old_id.replace(from_, to_)
            api.add_retirement(
                'Concept', dict(id=old_id, comment='renaming', replacement=new_id))
            row[columns['ID']] = new_id
            writer.writerow(row)

    # write adapted metadata to the new path:
    new_md_name = new_tsv_name + MD_SUFFIX
    md = jsonlib.load(old_md, object_pairs_hook=OrderedDict)
    # NOTE(review): the table url is set to the *metadata* file name here, not
    # to the name of the renamed concept list file - confirm this is intended.
    md['tables'][0]['url'] = new_md_name
    jsonlib.dump(md, folder / new_md_name, indent=4)

    # remove obsolete concept list and metadata:
    old_tsv.unlink()
    old_md.unlink()

    # adapt conceptlists.tsv
    index = api.data_path('conceptlists.tsv')
    rows = [
        [col.replace(from_, to_) if col else col for col in row]
        for row in reader(index, delimiter='\t')
    ]
    with UnicodeWriter(index, delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement(
        'Conceptlist', dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run grep -r "{0}" concepticondata/ | grep -v retired.json to confirm the renaming was complete!""".format(from_))
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    The catalog path is taken from the first positional argument or, when no
    argument is given, from the environment variable CDSTAR_CATALOG.

    CDSTAR authorisation information should be supplied in the form of
    environment variables:
        - CDSTAR_URL
        - CDSTAR_USER
        - CDSTAR_PWD

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    if args.args:
        catalog_path = args.args[0]
    else:
        catalog_path = os.environ["CDSTAR_CATALOG"]

    toc = ["# Sources\n"]
    api = Concepticon(args.repos)

    with SourcesCatalog(api.data_path("sources", "cdstar.json")) as lcat:
        credentials = {
            "cdstar_url": os.environ["CDSTAR_URL"],
            "cdstar_user": os.environ["CDSTAR_USER"],
            "cdstar_pwd": os.environ["CDSTAR_PWD"],
        }
        with Catalog(catalog_path, **credentials) as cat:
            pdfs = sorted(api.data_path("sources").glob("*.pdf"), key=lambda f: f.stem)
            for pdf in pdfs:
                clid = as_unicode(pdf.stem)
                if lcat.get(clid):
                    # Already registered in the local catalog.
                    continue
                created = list(cat.create(pdf, {"collection": "concepticon"}))
                _, _, obj = created[0]
                lcat.add(clid, obj)

        # One markdown bullet per registered source, ordered by key.
        for key in sorted(lcat.items):
            entry = lcat.get(key)
            line = "- [{0} [PDF {1}]]({2})".format(
                key, format_size(entry["size"]), entry["url"])
            toc.append(line)

    readme(api.data_path("sources"), toc)
    print(catalog_path)
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    The path of the CDSTAR catalog is read from the first positional argument;
    without arguments the environment variable CDSTAR_CATALOG is used instead.

    CDSTAR authorisation information should be supplied in the form of
    environment variables:
        - CDSTAR_URL
        - CDSTAR_USER
        - CDSTAR_PWD

    A missing environment variable raises ``KeyError``.

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    catalog_path = args.args[0] if args.args else os.environ['CDSTAR_CATALOG']
    toc = ['# Sources\n']
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(
                    api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                # Only PDFs without an entry in the local catalog are created.
                if not lcat.get(clid):
                    _, _, obj = list(cat.create(fname, {'collection': 'concepticon'}))[0]
                    # The value returned by ``add`` was never read (the local it
                    # was bound to is reassigned below), so it is not kept.
                    lcat.add(clid, obj)

        # One markdown bullet per registered source, ordered by key.
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))

    readme(api.data_path('sources'), toc)
    print(catalog_path)
def link(args):
    """
    Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS or
    CONCEPTICON_ID is given, the other is added.

    The argument is first tried as a path as given; if that is not an existing
    file, it is looked up in the ``conceptlists`` data directory.

    concepticon link <concept-list>
    """
    api = Concepticon(args.data)
    name = args.args[0]

    conceptlist = Path(name)
    if not (conceptlist.exists() and conceptlist.is_file()):
        # Fall back to a concept list shipped with the data.
        conceptlist = api.data_path('conceptlists', name)
        if not (conceptlist.exists() and conceptlist.is_file()):
            raise ParserError('no file %s found' % name)

    linker = Linker(conceptlist.stem, api.conceptsets.values())
    rewrite(conceptlist, linker)
def link(args):
    """
    Link concepts to concept sets for a given concept list.

    Notes
    -----
    If either CONCEPTICON_GLOSS or CONCEPTICON_ID is given, the other is added.

    The argument is tried as a file path first and, failing that, as a file
    name inside the ``conceptlists`` data directory. ``ParserError`` is raised
    when neither names an existing file.

    Examples
    --------
    $ concepticon link path_to_conceptlist.tsv
    """
    def _is_existing_file(path):
        return path.exists() and path.is_file()

    api = Concepticon(args.repos)
    requested = args.args[0]

    conceptlist = Path(requested)
    if not _is_existing_file(conceptlist):
        conceptlist = api.data_path('conceptlists', requested)
    if not _is_existing_file(conceptlist):
        raise ParserError('no file %s found' % requested)

    rewrite(conceptlist, Linker(conceptlist.stem, api.conceptsets.values()))
def check(api=None):
    """
    Run the consistency checks on the data reachable through `api`.

    Problems are reported through the module-level helpers ``error`` and
    ``warning``; this function itself does not raise for them.

    :param api: API object to check. When falsy, ``Concepticon(REPOS_PATH)`` is used; \
    if ``REPOS_PATH`` does not exist, nothing is checked and ``None`` is returned.
    :return: The module-level ``SUCCESS`` value (presumably cleared by ``error`` when a \
    check fails - verify against its definition).
    """
    if not api:
        if not REPOS_PATH.exists():
            return  # pragma: no cover
        api = Concepticon(REPOS_PATH)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    for meta in api.metadata.values():
        # Column names declared in the schema vs. the keys of the first row of values.
        cnames_schema = set(var['name'] for var in meta.meta['tableSchema']['columns'])
        cnames_tsv = set(list(meta.values.values())[0])
        if cnames_tsv - cnames_schema:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        for i, value in enumerate(meta.values.values()):
            if set(value.keys()) != cnames_schema:  # pragma: no cover
                # i + 2 is reported as the line number (presumably 1-based, after a
                # header line - confirm).
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(meta.id, i + 2), 'name')
        for ref in split(meta.meta.get('dc:references') or ''):
            all_refs.add(ref)

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for i, cl in enumerate(api.conceptlists.values()):
        for ref in cl.refs:
            if ref not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(ref), 'conceptlists.tsv', i + 2)
            all_refs.add(ref)
        # Cite keys mentioned in the note text count as used, too.
        refs_in_text = re.findall(BIB_PATTERN, cl.note)
        for ref in refs_in_text:
            all_refs.add(ref)

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')
    # This key is always treated as referenced, so it is never reported as unused.
    all_refs.add('List2016a')

    for ref in api.bibliography:
        if ref not in all_refs:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for references to concept sets, by ID and by gloss.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': set(cs.gloss for cs in api.conceptsets.values()),
    }

    # Both ends of every relation must name a known concept set.
    for i, rel in enumerate(api.relations.raw):
        for attr, type_ in [
            ('SOURCE', 'concepticon_id'),
            ('TARGET', 'concepticon_id'),
            ('SOURCE_GLOSS', 'concepticon_gloss'),
            ('TARGET_GLOSS', 'concepticon_gloss'),
        ]:
            if rel[attr] not in ref_cols[type_]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(attr, rel[attr]), 'conceptrelations', i + 2)

    # Every *.tsv file in the conceptlists directory must be a known concept list.
    for fname in api.data_path('conceptlists').glob('*.tsv'):
        if fname.stem not in api.conceptlists:  # pragma: no cover
            error(
                'conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    for cl in api.conceptlists.values():
        for i, concept in enumerate(cl.concepts.values()):
            # The presence of the source language columns is checked once per list,
            # on its first concept.
            if i == 0:  # pragma: no cover
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            # Each concept needs a value for every source language; a missing
            # english value is tolerated when the concept has no gloss.
            for lg in cl.source_language:  # pragma: no cover
                if not (concept.attributes.get(lg.lower()) or
                        getattr(concept, lg.lower(), None) or
                        (lg.lower() == 'english' and not concept.gloss)):
                    error('missing source language translation %s' % lg, cl.id, i + 2)
            # Non-empty concept set references must point to existing concept sets.
            for attr, values in ref_cols.items():
                val = getattr(concept, attr)
                if val and val not in values:  # pragma: no cover
                    error('invalid value for %s: %s' % (attr, val), cl.id, i + 2)

    # Group concept sets connected by a 'sameas' relation: a concept set whose
    # target already belongs to a group joins that group, otherwise a new group
    # is started under the concept set's gloss.
    sameas = {}
    glosses = set()
    for cs in api.conceptsets.values():
        if cs.gloss in glosses:  # pragma: no cover
            error('duplicate conceptset gloss: {0}'.format(cs.gloss), cs.id)
        glosses.add(cs.gloss)
        for target, rel in cs.relations.items():
            if rel == 'sameas':
                for group in sameas.values():
                    if target in group:  # pragma: no cover
                        group.add(cs.id)
                        break
                else:
                    sameas[cs.gloss] = {cs.id, target}

    # Within each group the numerically smallest ID is kept; every other ID is
    # mapped to it as deprecated.
    deprecated = {}
    for s in sameas.values():
        csids = sorted(s, key=lambda j: int(j))
        for csid in csids[1:]:
            assert csid not in deprecated
            deprecated[csid] = csids[0]

    # No concept may be linked to a deprecated concept set.
    for cl in api.conceptlists.values():
        for concept in cl.concepts.values():
            if concept.concepticon_id in deprecated:  # pragma: no cover
                error('deprecated concept set {0} linked for {1}'.format(
                    concept.concepticon_id, concept.id), cl.id)

    return SUCCESS
def test():
    """
    Run the integrity checks on the data found at ``REPOS_PATH``.

    Nothing is checked when ``REPOS_PATH`` does not exist. Problems are
    reported through ``error`` and ``warning``.

    :raises ValueError: if the module-level ``SUCCESS`` is falsy after all checks ran.
    """
    if not REPOS_PATH.exists():
        return  # pragma: no cover

    api = Concepticon(REPOS_PATH)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    for meta in api.metadata.values():
        cnames_schema = {var['name'] for var in meta.meta['tableSchema']['columns']}
        cnames_tsv = set(list(meta.values.values())[0])
        if not cnames_tsv.issubset(cnames_schema):  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        for lineno, value in enumerate(meta.values.values(), start=2):
            if set(value.keys()) != cnames_schema:  # pragma: no cover
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(meta.id, lineno), 'name')
        all_refs.update(split(meta.meta.get('dc:references') or ''))

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for lineno, cl in enumerate(api.conceptlists.values(), start=2):
        for ref in cl.refs:
            if ref not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(ref), 'conceptlists.tsv', lineno)
            all_refs.add(ref)
        # Cite keys mentioned in the note text count as used, too.
        all_refs.update(re.findall(BIB_PATTERN, cl.note))

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')

    for ref in api.bibliography:
        if ref not in all_refs:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for references to concept sets, by ID and by gloss.
    valid_ids = set(api.conceptsets.keys())
    valid_glosses = {cs.gloss for cs in api.conceptsets.values()}
    ref_cols = {
        'concepticon_id': valid_ids,
        'concepticon_gloss': valid_glosses,
    }

    # Both ends of every relation must name a known concept set.
    relation_checks = (
        ('SOURCE', valid_ids),
        ('TARGET', valid_ids),
        ('SOURCE_GLOSS', valid_glosses),
        ('TARGET_GLOSS', valid_glosses),
    )
    for lineno, rel in enumerate(api.relations.raw, start=2):
        for attr, allowed in relation_checks:
            if rel[attr] not in allowed:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(attr, rel[attr]), 'conceptrelations', lineno)

    # Every *.tsv file in the conceptlists directory must be a known concept list.
    for fname in api.data_path('conceptlists').glob('*.tsv'):
        if fname.stem not in api.conceptlists:  # pragma: no cover
            error(
                'conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    for cl in api.conceptlists.values():
        for i, concept in enumerate(cl.concepts.values()):
            # The presence of the source language columns is checked once per list,
            # on its first concept.
            if i == 0:  # pragma: no cover
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            # Each concept needs a value for every source language.
            for lg in cl.source_language:  # pragma: no cover
                column = lg.lower()
                if not concept.attributes.get(column) \
                        and not getattr(concept, column, None):
                    error('missing source language translation %s' % lg, cl.id, i + 2)

            # Non-empty concept set references must point to existing concept sets.
            for attr, values in ref_cols.items():
                val = getattr(concept, attr)
                if val and val not in values:  # pragma: no cover
                    error('invalid value for %s: %s' % (attr, val), cl.id, i + 2)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')