from collections import defaultdict, Counter

from pybtex.database import BibliographyData, parse_file
from termcolor import colored
from clldutils.path import Path
from clldutils.markup import Table

from pylexibank.commands.util import with_dataset, _unload
from pylexibank.dataset import Dataset


def clean(args):
    """
    Remove CLDF formatted data for a given dataset.

    lexibank clean [DATASET_ID]
    """
    with_dataset(args, Dataset._clean)

def bib(args):
    """Merge the .bib files of all datasets' CLDF directories into one global bibliography."""
    gbib = BibliographyData()

    def _harvest(ds, **kw):
        for bib in ds.cldf_dir.glob('*.bib'):
            bib = parse_file(str(bib))
            for id_, entry in bib.entries.items():
                # Namespace entry keys with the dataset id to avoid collisions.
                id_ = '{0}:{1}'.format(ds.id, id_)
                if id_ not in gbib.entries:
                    gbib.add_entry(id_, entry)

    with_dataset(args, _harvest, default_to_all=True)
    gbib.to_file(
        str(Path(args.cfg['paths']['lexibank']).joinpath('lexibank.bib')))

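# A sketch of the resulting key scheme (the dataset id 'abvd' and the BibTeX
# key 'Greenhill2008' are hypothetical examples): an entry keyed
# 'Greenhill2008' in the dataset 'abvd' ends up in the global bibliography as
# 'abvd:Greenhill2008', so identical keys in different datasets cannot
# collide; only the first occurrence of a composite key is kept.
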
def diff(args):
    """Show the git working-tree status of dataset repositories."""
    def _diff(ds, **kw):
        repo = ds.git_repo
        if repo and repo.is_dirty():
            print('{0} at {1}'.format(
                colored(ds.id, 'blue', attrs=['bold']),
                colored(str(ds.dir), 'blue')))
            for i, item in enumerate(repo.index.diff(None)):
                if i == 0:
                    print(colored('modified:', attrs=['bold']))
                print(colored(item.a_path, 'green'))
            for i, path in enumerate(repo.untracked_files):
                if i == 0:
                    print(colored('untracked:', attrs=['bold']))
                print(colored(path, 'green'))
            print()

    if not args.args:
        args.args = [ds.id for ds in args.cfg.datasets]
    with_dataset(args, _diff)

def coverage(args):  # pragma: no cover
    from pyconcepticon.api import Concepticon

    varieties = defaultdict(set)
    glangs = defaultdict(set)
    concept_count = defaultdict(set)
    res80 = Counter()
    res85 = Counter()
    res90 = Counter()
    res80v = Counter()
    res85v = Counter()
    res90v = Counter()

    def _coverage(ds, **kw):
        ds.coverage(varieties, glangs, concept_count)

    with_dataset(args, _coverage)

    print('varieties', len(varieties))

    concepticon = Concepticon(args.cfg['paths']['concepticon'])
    for cl in concepticon.conceptlists.values():
        try:
            concepts = set(
                int(cc.concepticon_id) for cc in cl.concepts.values() if cc.concepticon_id)
        except ValueError:  # skip lists with non-numeric Concepticon ids
            continue

        for varid, meanings in varieties.items():
            # Relative coverage: the share of the concept list attested in the variety.
            c = len(concepts.intersection(meanings)) / len(concepts)
            if c >= 0.8:
                res80v.update([cl.id])
            if c >= 0.85:
                res85v.update([cl.id])
            if c >= 0.9:
                res90v.update([cl.id])

        for varid, meanings in glangs.items():
            # The same computation, aggregated per Glottolog language.
            c = len(concepts.intersection(meanings)) / len(concepts)
            if c >= 0.8:
                res80.update([cl.id])
            if c >= 0.85:
                res85.update([cl.id])
            if c >= 0.9:
                res90.update([cl.id])

    def print_count(count):
        t = Table('concept list', 'glang count')
        for p in count.most_common(n=10):
            t.append(list(p))
        print(t.render(tablefmt='simple', condensed=False))

    print('\nGlottolog languages with coverage >= 80%:')
    print_count(res80)
    print('\nGlottolog languages with coverage >= 85%:')
    print_count(res85)
    print('\nGlottolog languages with coverage >= 90%:')
    print_count(res90)
    print('\nVarieties with coverage >= 80%:')
    print_count(res80v)
    print('\nVarieties with coverage >= 85%:')
    print_count(res85v)
    print('\nVarieties with coverage >= 90%:')
    print_count(res90v)

    print('\ntop-200 concepts:')
    t = Table('cid', 'gloss', 'varieties')
    for n, m in sorted(
            [(cid, len(vars_)) for cid, vars_ in concept_count.items()],
            key=lambda i: -i[1])[:200]:
        t.append([n, concepticon.conceptsets['%s' % n].gloss, m])
    print(t.render(tablefmt='simple', condensed=False))

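# Worked example of the threshold logic above (hypothetical numbers): a
# concept list with 200 Concepticon ids of which a variety attests 170 yields
# c = 170 / 200 = 0.85, so the list is counted in res80v and res85v, but not
# in res90v.
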
def check_phonotactics(args):
    """Check the segmented forms of a dataset."""
    with_dataset(args, Dataset._check_phonotactics)

def check_profile(args):
    """Check the orthography profile of a dataset."""
    with_dataset(args, Dataset._check_profile)

def makecldf(args):
    """
    Convert a dataset into CLDF.

    lexibank makecldf DATASET_ID
    """
    with_dataset(args, Dataset._install)

def download(args):
    """
    Run a dataset's download command.

    lexibank download DATASET_ID
    """
    with_dataset(args, Dataset._download)

def unload(args):
    """Remove a dataset's data from the lexibank database."""
    with_dataset(args, _unload, default_to_all=True)

from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
from prompt_toolkit.completion import Completer, Completion
from termcolor import colored
from appdirs import user_data_dir
from clldutils.path import Path
from clldutils.clilib import command

from pylexibank.util import aligned
from pylexibank.commands.util import with_dataset, _load, _unload
from pylexibank.dataset import Dataset

commands = {
    'quit': lambda args: None,
    'download': lambda args: with_dataset(args, Dataset._download),
    'makecldf': lambda args: with_dataset(args, Dataset._install),
    'dbload': lambda args: with_dataset(args, _load),
    'dbunload': lambda args: with_dataset(args, _unload),
    'orthography': lambda args: None,
    'help': lambda args: print("Available Commands: \n%s" % aligned([
        (k, getattr(v, '__doc__', '')) for k, v in sorted(commands.items())
    ])),
}
commands['quit'].__doc__ = ': exits lexibank curator'
commands['download'].__doc__ = "<dataset> : run <dataset>'s download method"
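

# The table above maps command names to callables taking the parsed cli
# arguments. A minimal sketch of how it could drive an interactive loop
# follows; `_example_repl` is a hypothetical illustration, not part of
# pylexibank's API, and it ignores anything following the command name.
def _example_repl(args):  # pragma: no cover
    from prompt_toolkit.history import InMemoryHistory
    from prompt_toolkit.shortcuts import prompt

    history = InMemoryHistory()
    while True:
        line = prompt(
            'lexibank> ', history=history, auto_suggest=AutoSuggestFromHistory()).strip()
        if not line:
            continue
        cmd = line.split()[0]
        if cmd == 'quit':
            break
        # Dispatch via the commands table, falling back to 'help'.
        commands.get(cmd, commands['help'])(args)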