def __init__(self, obj):
    CldfDataset.__init__(self, obj)
    self.concepticon = {}
    if Concepticon:
        concepticon = Concepticon(_venvs.joinpath('concepticon', 'concepticon-data'))
        for concept in concepticon.conceptlist('Haspelmath-2009-1460'):
            self.concepticon[concept['WOLD_ID']] = concept['CONCEPTICON_ID']

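# Hedged usage sketch for the mapping built above: self.concepticon maps WOLD
# concept IDs to Concepticon IDs (the class name and the concrete key below
# are hypothetical):
#
#   ds = WoldDataset(obj)                 # WoldDataset stands in for the class above
#   concepticon_id = ds.concepticon.get('1-1')
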
def upload_sources(args):
    """
    concepticon upload_sources path/to/cdstar/catalog
    """
    toc = ['# Sources\n']
    api = Concepticon(args.data)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                args.args[0],
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                spec = lcat.get(clid)
                if not spec:
                    _, _, obj = list(cat.create(fname, {'collection': 'concepticon'}))[0]
                    spec = lcat.add(clid, obj)
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))
    readme(api.data_path('sources'), toc)

def map_concepts(args):
    api = Concepticon(args.data)
    api.map(
        Path(args.args[0]),
        otherlist=args.args[1] if len(args.args) > 1 else None,
        out=args.output,
        full_search=args.full_search,
        language=args.language)

def link(args):
    """
    Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS
    or CONCEPTICON_ID is given, the other is added.

    concepticon link <concept-list>
    """
    api = Concepticon(args.data)
    conceptlist = Path(args.args[0])
    if not conceptlist.exists() or not conceptlist.is_file():
        conceptlist = api.data_path('conceptlists', args.args[0])
        if not conceptlist.exists() or not conceptlist.is_file():
            raise ParserError('no file %s found' % args.args[0])
    rewrite(conceptlist, Linker(conceptlist.stem, api.conceptsets.values()))

def coverage(args):
    from pyconcepticon.api import Concepticon

    varieties = defaultdict(set)

    def _coverage(ds, **kw):
        ds.coverage(varieties)

    with_dataset(args, _coverage)
    print('varieties', len(varieties))

    c = Concepticon(args.concepticon_repos)
    res = Counter()
    for cl in c.conceptlists.values():
        try:
            concepts = set(
                int(cc.concepticon_id) for cc in cl.concepts.values()
                if cc.concepticon_id)
        except (TypeError, ValueError):
            continue
        for varid, meanings in varieties.items():
            if concepts.issubset(meanings):
                res.update([cl.id])

    t = Table('concept list', 'variety count')
    for p in res.most_common():
        t.append(list(p))
    print(t.render(tablefmt='simple', condensed=False))

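# Minimal, self-contained illustration of the inclusion test used in
# coverage() above (all IDs hypothetical): a variety counts as covering a
# concept list if every Concepticon ID on the list is among its meanings.
meanings_by_variety = {'variety-a': {1, 2, 3, 4}, 'variety-b': {1, 2}}
conceptlist = {1, 2, 3}
covered = [v for v, m in meanings_by_variety.items() if conceptlist.issubset(m)]
assert covered == ['variety-a']
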
def word_length(args):
    from pyconcepticon.api import Concepticon

    c = Concepticon(args.concepticon_repos)
    res = defaultdict(lambda: defaultdict(list))

    def _word_length(ds, **kw):
        ds.word_length(res)

    with_dataset(args, _word_length)
    concepts = c.conceptsets
    languoids = {l.id: l for l in Glottolog(args.glottolog_repos).languoids()}

    with UnicodeWriter('wordlength.csv') as writer:
        writer.writerow([
            'Concepticon_ID', 'Gloss', 'Semanticfield', 'Category',
            'Glottocode', 'Variety', 'Family', 'Form', 'Length'])
        for pid, langs in res.items():
            if len(langs) >= 500:
                for (lang, variety), forms in langs.items():
                    if lang in languoids:
                        lengths = [len(f.split()) for f in forms]
                        lang = languoids[lang]
                        family = lang.lineage[0][0] if lang.lineage else ''
                        concept = concepts[pid]
                        writer.writerow([
                            pid, concept['GLOSS'], concept['SEMANTICFIELD'],
                            concept['ONTOLOGICAL_CATEGORY'], lang.id, variety,
                            family, forms[0],
                            sum(lengths) / len(lengths)])

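# The 'Length' column above is the mean number of whitespace-separated tokens
# per form; a tiny illustration with made-up forms:
forms = ['big house', 'hut']
lengths = [len(f.split()) for f in forms]
assert sum(lengths) / len(lengths) == 1.5
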
def load(args):
    """
    clics load /path/to/concepticon-data /path/to/glottolog
    """
    if len(args.args) != 2:
        raise ParserError(
            'concepticon and glottolog repos locations must be specified!')
    concepticon = Path(args.args[0])
    if not concepticon.exists():
        raise ParserError('concepticon repository does not exist')
    glottolog = Path(args.args[1])
    if not glottolog.exists():
        raise ParserError('glottolog repository does not exist')

    args.api.db.create(exists_ok=True)
    args.log.info('loading datasets into {0}'.format(args.api.db.fname))
    in_db = args.api.db.datasets
    for ds in iter_datasets():
        if args.unloaded and ds.id in in_db:
            args.log.info('skipping {0} - already loaded'.format(ds.id))
            continue
        args.log.info('loading {0}'.format(ds.id))
        args.api.db.load(ds)
    args.log.info('loading Concepticon data')
    args.api.db.load_concepticon_data(Concepticon(str(concepticon)))
    args.log.info('loading Glottolog data')
    args.api.db.load_glottolog_data(Glottolog(str(glottolog)))

def load_concepticon():
    concepticon = {line['ID']: line for line in Concepticon().conceptsets()}
    return concepticon

def load_concepticon():
    concepticon = {line.id: line for line in Concepticon().conceptsets.values()}
    return concepticon

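# The two load_concepticon variants above target different pyconcepticon API
# versions: in the older one, conceptsets() is a method yielding dicts keyed
# by 'ID'; in the newer one, conceptsets is a mapping of objects with an `id`
# attribute. A hedged usage sketch (the Concepticon ID is illustrative):
#
#   conceptsets = load_concepticon()
#   print(conceptsets['1247'].gloss)
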
def lookup(args):
    """
    Look up one or more glosses from the command line.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    found = api.lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    with UnicodeWriter(None, delimiter='\t') as writer:
        writer.writerow(
            ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for f in found:
            writer.writerow(f)
    print(writer.read().decode('utf-8'))

def _set_operation(args, type_):
    res = list(Concepticon(args.repos)._set_operation(type_, *args.args))
    if res:
        frmt = "{0:3} {1:1}{2:" + str(max(len(r[2]) for r in res)) + "} [{3:4}] {4}"
        for i, line in enumerate(res):
            print(frmt.format(i + 1, line[0], line[2], line[1], line[3]))
    return res

def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)
    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    # Write the adapted concept list to the new path:
    with UnicodeWriter(
            cl.path.parent / cl.path.name.replace(from_, to_), delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                header = {v: k for k, v in enumerate(header)}  # Map col name to row index
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid, comment='renaming', replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # Write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(cl.path.parent / (cl.path.name + MD_SUFFIX), object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # Remove the obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # Adapt conceptlists.tsv:
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])
    with UnicodeWriter(api.data_path('conceptlists.tsv'), delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement(
        'Conceptlist', dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json
to confirm the renaming was complete!""".format(from_))

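# The per-row ID rewrite in rename() is a plain prefix replacement; a minimal
# illustration with hypothetical conceptlist IDs:
from_, to_ = 'Old-2000-100', 'New-2000-100'
oid = 'Old-2000-100-13'
assert oid.startswith(from_)
assert oid.replace(from_, to_) == 'New-2000-100-13'
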
def notlinked(args):
    api = Concepticon(args.repos)
    i = 0
    for _, cl in sorted(api.conceptlists.items(), key=lambda p: p[0]):
        for concept in sorted(
                cl.concepts.values(),
                key=lambda p: int(re.match('([0-9]+)', p.number).groups()[0])):
            if not concept.concepticon_id:
                candidates = [
                    c for c in list(api.lookup([concept.label]))[0] if c[3] < 3]
                if candidates:
                    candidate = sorted(candidates, key=lambda c: c[3])[0]
                    candidate = "{0} [{1}]".format(candidate[2], candidate[1])
                    i += 1
                    print("{0} {1.id}: {1.label}: {2}".format(i, concept, candidate))

def concept_coverage():
    concepts = [h['concepticon_id'] for h in stdb_concepts().values()]
    concepticon = Concepticon()
    lists = ['Blust-2008-210', 'Comrie-1977-207', 'Matisoff-1978-200']
    for l in lists:
        cids = [c.concepticon_id for c in concepticon.conceptlists[l].concepts.values()]
        olap = len([x for x in concepts if x in cids])
        print('*', l, olap)

def attributes(args):
    """Calculate the additional attributes in the lists."""
    api = Concepticon(args.data)
    attrs = Counter()
    for cl in api.conceptlists.values():
        attrs.update(cl.attributes)
    print(tabulate(list(attrs.most_common()), headers=('Attribute', 'Occurrences')))

def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexibank/lexibank-data')
    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="lexibank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    glottolog = Glottolog(
        Path(lexibank.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}
    concepticon = Concepticon(
        Path(lexibank.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c['ID']: c for c in concepticon.conceptsets()}

    for dname in repos.joinpath('datasets').iterdir():
        #if dname.name not in ['acbd']:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            #if dname.name != 'zenodo34092':
            #    continue
            mdpath = dname.joinpath('metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexibankLanguage),
            glottolog=languoids,
            isolates_icon='tcccccc')

def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    CDSTAR authorisation information should be supplied in the form of
    environment variables:
    - CDSTAR_URL
    - CDSTAR_USER
    - CDSTAR_PWD

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    catalog_path = args.args[0] if args.args else os.environ["CDSTAR_CATALOG"]
    toc = ["# Sources\n"]
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path("sources", "cdstar.json")) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ["CDSTAR_URL"],
                cdstar_user=os.environ["CDSTAR_USER"],
                cdstar_pwd=os.environ["CDSTAR_PWD"],
        ) as cat:
            for fname in sorted(api.data_path("sources").glob("*.pdf"), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                spec = lcat.get(clid)
                if not spec:
                    _, _, obj = list(cat.create(fname, {"collection": "concepticon"}))[0]
                    lcat.add(clid, obj)
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append("- [{0} [PDF {1}]]({2})".format(
                key, format_size(spec["size"]), spec["url"]))
    readme(api.data_path("sources"), toc)
    print(catalog_path)

def stats(args):
    """
    Write statistics to README.

    concepticon stats
    """
    api = Concepticon(args.data)
    cls = api.conceptlists.values()
    readme_conceptlists(api, cls)
    readme_concept_list_meta(api)
    readme_concepticondata(api, cls)

def link(args):
    """
    Link concepts to concept sets for a given concept list.

    Notes
    -----
    If either CONCEPTICON_GLOSS or CONCEPTICON_ID is given, the other is
    added.

    Examples
    --------
    $ concepticon link path_to_conceptlist.tsv
    """
    api = Concepticon(args.repos)
    conceptlist = Path(args.args[0])
    if not conceptlist.exists() or not conceptlist.is_file():
        conceptlist = api.data_path('conceptlists', args.args[0])
        if not conceptlist.exists() or not conceptlist.is_file():
            raise ParserError('no file %s found' % args.args[0])
    rewrite(conceptlist, Linker(conceptlist.stem, api.conceptsets.values()))

def lookup(args):
    """
    Look up the specified glosses in Concepticon.

    Examples
    --------
    $ concepticon lookup gloss1 gloss2 gloss3 ...
    """
    api = Concepticon()
    found = api.lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    with UnicodeWriter(None) as writer:
        writer.writerow(['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for matches in found:
            for m in matches:
                writer.writerow(m)
    print(writer.read().decode('utf-8'))

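# Hedged Python-level sketch of the same lookup (gloss values and the
# similarity level are illustrative; the tuple layout follows the header
# written above):
api = Concepticon()
for matches in api.lookup(['hand', 'arm'], similarity_level=2):
    for gloss, cid, cgloss, similarity in matches:
        print(gloss, cid, cgloss, similarity)
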
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    CDSTAR authorisation information should be supplied in the form of
    environment variables:
    - CDSTAR_URL
    - CDSTAR_USER
    - CDSTAR_PWD

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    catalog_path = args.args[0] if args.args else os.environ['CDSTAR_CATALOG']
    toc = ['# Sources\n']
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(
                    api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                spec = lcat.get(clid)
                if not spec:
                    _, _, obj = list(cat.create(fname, {'collection': 'concepticon'}))[0]
                    spec = lcat.add(clid, obj)
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))
    readme(api.data_path('sources'), toc)
    print(catalog_path)

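# Hedged shell sketch of the environment the upload_sources variants expect
# (all values are placeholders):
#
#   export CDSTAR_URL=https://cdstar.example.org
#   export CDSTAR_USER=me
#   export CDSTAR_PWD=secret
#   concepticon upload_sources path/to/cdstar/catalog
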
def stats(args):
    """
    Generate new statistics for concepticondata/README.md.

    Examples
    --------
    $ concepticon stats
    """
    api = Concepticon(args.repos)
    cls = api.conceptlists.values()
    readme_conceptlists(api, cls)
    readme_concept_list_meta(api)
    readme_concepticondata(api, cls)

def map_concepts(args):
    """
    Attempt an automatic mapping for a new concept list.

    Notes
    -----
    In order for the automatic mapping to work, the new list has to be
    well-formed, i.e. in line with the requirements of Concepticon
    (GLOSS/ENGLISH column, see also CONTRIBUTING.md).

    Examples
    --------
    $ concepticon map_concepts path_to_conceptlist.tsv
    """
    api = Concepticon(args.repos)
    api.map(
        Path(args.args[0]),
        otherlist=args.args[1] if len(args.args) > 1 else None,
        out=args.output,
        full_search=args.full_search,
        language=args.language,
        skip_multiple=args.skip_multimatch)

def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')
    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    glottolog_repos = Path(
        lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')

def check(args):
    """
    Identify some issues with concept lists, i.e. multiple words with the
    same CONCEPTICON_ID or missing definitions.

    concepticon check [CONCEPTLIST_ID]+
    """
    def _pprint(clist, error, _id, message):
        print("\t".join([clist.ljust(30), error.ljust(10), '%5s' % _id, message]))

    def _get_mergers(api, clist):
        o = api.conceptlists[clist]
        # Group concepts by CONCEPTICON_ID to find clashes:
        clashes = defaultdict(list)
        for c in o.concepts:
            clashes[o.concepts[c].concepticon_id].append(c)
        if '' in clashes:
            clashes.pop('')
        for c in sorted([c for c in clashes if len(clashes[c]) > 1]):
            matches = [m for m in o.concepts if o.concepts[m].concepticon_id == c]
            for i, m in enumerate(matches, 1):
                message = '#%d %s = "%s"' % (
                    i, api.conceptsets[c].gloss, getattr(o.concepts[m], 'english', ''))
                _pprint(clist, 'MERGE', c, message)

    def _get_missing(api, clist):
        o = api.conceptlists[clist]
        missings = [c for c in o.concepts if o.concepts[c].concepticon_id == ""]
        concepts = api.conceptlists[clist].concepts
        for m in missings:
            _pprint(clist, 'MISSING', concepts[m].number, '"%s"' % concepts[m].english)

    api = Concepticon(args.data)
    # Conceptlists to check:
    if len(args.args):
        clists = [_ for _ in api.conceptlists if _ in args.args]
    else:
        clists = api.conceptlists
    for clist in clists:
        _get_missing(api, clist)
        _get_mergers(api, clist)

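# Minimal illustration of the clash grouping in _get_mergers (IDs are
# hypothetical): concepts mapped to the same CONCEPTICON_ID are flagged as
# potential mergers, while unmapped concepts (empty ID) are ignored.
from collections import defaultdict

clashes = defaultdict(list)
for concept_id, concepticon_id in [('1', '1234'), ('2', '1234'), ('3', '')]:
    clashes[concepticon_id].append(concept_id)
clashes.pop('')
assert {c: ids for c, ids in clashes.items() if len(ids) > 1} == {'1234': ['1', '2']}
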
def test(args):  # pragma: no cover
    """
    Run a number of tests on all concept lists in Concepticon.

    Notes
    -----
    Tests for issues with column names, file names, IDs, source availability,
    etc. Best run after you went through the whole procedure of adding a new
    list to Concepticon.

    Examples
    --------
    $ concepticon test
    """
    from pyconcepticon.check_data import check
    if check(Concepticon(args.repos)):
        args.log.info('all integrity tests passed: OK')

def recreate_linking_data(args):
    """
    Regenerate pyconcepticon/data/map*.

    Notes
    -----
    map* files contain lists of all concept-to-word-in-language mappings
    available within Concepticon.

    Examples
    --------
    $ concepticon recreate_linking_data
    """
    api = Concepticon(args.repos)
    for l in api.vocabularies['COLUMN_TYPES'].values():
        if getattr(l, 'iso2', None):
            _write_linking_data(api, l)

def _set_operation(args, type_):
    assert type_ in ['union', 'intersection']
    api = Concepticon(args.data)
    out, clen = [], 0
    for c, lists in compare_conceptlists(api, *args.args):
        if type_ == 'union' \
                or len(set([x[0] for x in lists if x[1] >= 0])) == len(args.args):
            marker = '*' if not len([0 for x in lists if x[1] == 0]) else ''
            out += [(
                marker,
                c,
                api.conceptsets[c].gloss,
                ', '.join([
                    '{0[3]} ({0[1]}, {0[0]})'.format(x) for x in lists if x[1] != 0]))]
            clen = len(out[-1][2]) if len(out[-1][2]) > clen else clen
    frmt = '{0:3} {1:1}{2:' + text_type(clen) + '} [{3:4}] {4}'
    for i, line in enumerate(out):
        print(frmt.format(i + 1, line[0], line[2], line[1], line[3]))
    return out

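# Hedged CLI sketch: the union/intersection wrappers around _set_operation
# would be invoked roughly like this (subcommand names and list IDs assumed
# here for illustration):
#
#   concepticon intersection Swadesh-1955-100 Swadesh-1952-200
#   concepticon union Swadesh-1955-100 Swadesh-1952-200
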
def cldf(self, **kw):
    self.glottolog_version = git_describe(kw['glottolog_repos'])
    self.concepticon_version = git_describe(kw['concepticon_repos'])
    try:
        bag = bagit.Bag(self.raw.parent.as_posix())
        if not bag.is_valid():
            if confirm('The downloaded data has changed. Update checksums?'):
                bag.save(manifests=True)
                assert bag.is_valid()
            else:
                raise bagit.BagError('invalid raw data')
        concepticon = Concepticon(kw['concepticon_repos'])
        if self.conceptlist:
            self.conceptlist = concepticon.conceptlists[self.conceptlist]
        self._run_command('cldf', concepticon, **kw)
    except bagit.BagError:
        self.log.error('invalid raw data for dataset %s' % self.id)

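# Minimal bagit sketch mirroring the validation flow above (the path is a
# placeholder; Bag, is_valid() and save() are used exactly as in cldf()):
import bagit

bag = bagit.Bag('/path/to/raw')
if not bag.is_valid():
    bag.save(manifests=True)  # regenerate checksums after intentional changes
    assert bag.is_valid()
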
def attributes(args):
    """
    Print all columns in concept lists that contain surplus information.

    Notes
    -----
    Surplus information are columns not immediately required by Concepticon.

    Examples
    --------
    $ concepticon attributes
    """
    api = Concepticon(args.repos)
    attrs = Counter()
    for cl in api.conceptlists.values():
        attrs.update(cl.attributes)
    print(tabulate(list(attrs.most_common()), headers=('Attribute', 'Occurrences')))

def validate(args):
    """
    Checks for the availability of metadata for all concept lists.

    Notes
    -----
    Concept lists have to be included in concepticondata/conceptlists in
    order to be considered.

    Examples
    --------
    $ concepticon validate
    """
    api = Concepticon(args.repos)
    for cl in api.conceptlists.values():
        items = list(cl.metadata)
        if set(items[0].keys()) != set(c.name for c in cl.metadata.tableSchema.columns):
            print('unspecified column in concept list {0}'.format(cl.id))

def test(args):  # pragma: no cover
    """
    Run a number of tests on all concept lists in Concepticon.

    Notes
    -----
    Tests for issues with column names, file names, IDs, source availability,
    etc. Best run after you went through the whole procedure of adding a new
    list to Concepticon.

    Examples
    --------
    $ concepticon test
    """
    if Concepticon(args.repos).check(*args.args):
        args.log.info("all integrity tests passed: OK")
    else:
        args.log.error("inconsistent data in repository {0}".format(args.repos))