def run(args):
    dataset = get_dataset(args)
    with update(
            dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
        modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
        contribs = dataset.dir / 'CONTRIBUTORS.md'
        creators, contributors = get_creators_and_contributors(
            contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
        if creators:
            md['creators'] = [contrib(p) for p in creators]
        if contributors:
            md["contributors"] = [contrib(p) for p in contributors]
        communities = [r["identifier"] for r in md.get("communities", [])] + \
            [c.strip() for c in nfilter(args.communities.split(','))]
        if communities:
            md['communities'] = [
                {"identifier": community_id} for community_id in sorted(set(communities))]
        md.update(
            {
                "title": dataset.metadata.title,
                "access_right": "open",
                "keywords": sorted(set(md.get("keywords", []) + ["linguistics"] + modules)),
                "upload_type": "dataset",
            }
        )
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                    html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}

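# Note: the `contrib` helper used above is not defined in this snippet. Judging by the
# equivalent local helper in the makecldf-style command further below, it presumably just
# restricts a contributor record to the keys Zenodo's deposit metadata accepts.
# A minimal sketch, assuming that behaviour:
def contrib(d):
    # Keep only the fields relevant for Zenodo creator/contributor records.
    return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}
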
def run(args):
    ds = get_dataset(args)
    forms = []
    for row in ds.cldf_reader()['FormTable']:
        if row['Language_ID'] == args.language_id or not args.language_id:
            forms.append(row)

    P = syllable_inventories(forms, format=args.prosody_format)
    bipa = args.clts.from_config().api.transcriptionsystem_dict['bipa']

    table = []
    if args.display == 'long':
        header = ['Language', 'Sound', 'Template', 'Frequency']
        for language, data in P.items():
            for sound, templates in data.items():
                for template, frequency in templates.items():
                    table += [[language, sound, template, len(frequency)]]
    else:
        header = ['Language', 'Sound', 'Class', 'Frequency', 'Templates']
        for language, data in P.items():
            for sound, templates in data.items():
                table += [[
                    language,
                    sound,
                    bipa[sound].type,
                    sum(len(x) for x in templates.values()),
                    ', '.join(
                        '{0}:{1}'.format(x, len(y)) for x, y in templates.items()),
                ]]

    with Table(args, *header, rows=table):
        pass

def run(args):
    bipa = args.clts.api.bipa
    func = profile.simple_profile
    cols = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']
    kw = {'ref': 'form', 'clts': bipa}
    if args.context:
        func = profile.context_profile
        cols = ['Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence', 'Codepoints']
        kw['col'] = 'language_id'

    ds = get_dataset(args)
    profile_path = ds.etc_dir / 'orthography.tsv'
    if profile_path.exists() and not args.force:
        raise ParserError('Orthography profile exists already. To overwrite, pass "-f" flag')

    header, D = [], {}
    for i, row in enumerate(ds.cldf_reader()['FormTable'], start=1):
        if i == 1:
            header = [f for f in row.keys() if f != 'ID']
            D = {0: ['lid'] + [h.lower() for h in header]}
        row['Segments'] = ' '.join(row['Segments'])
        D[i] = [row['ID']] + [row[h] for h in header]

    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(cols)
        for row in func(Wordlist(D, row='parameter_id', col='language_id'), **kw):
            writer.writerow(row)
    args.log.info('Orthography profile written to {0}'.format(profile_path))

def run(args):
    dataset = get_dataset(args)
    if setup(dataset, force=args.test):
        if not args.test:  # pragma: no cover
            print(git.cmd.Git(str(dataset.dir)).status())
            print('You may include the following status badge in any markdown file in the repos:\n')
            print(build_status_badge(dataset))

def run(args): """ main function. """ ds = get_dataset(args) if args.medials: args.medials = set(args.medials.split(',')) errors = { 'length': defaultdict(list), 'syllable': defaultdict(list), 'missing': defaultdict(list) } if ds.cldf_dir.joinpath("forms.csv").exists(): for row in progressbar(ds.cldf_reader()["FormTable"], desc='iterate over wordlist'): if row['Language_ID'] == args.doculect or not args.doculect: strucs = get_structure(row['Segments'], medials=args.medials or MEDIALS) for i, (struc, segments) in enumerate( zip(strucs, morphemes(row['Segments']))): if len(struc) != len(segments): errors['length'][' '.join(segments), ' '.join(struc)] += [ (row['ID'], i, row['Language_ID'], row['Form'], row['Segments']) ] elif '?' in struc: errors['missing'][' '.join(segments), ' '.join(struc)] += [ (row['ID'], i, row['Language_ID'], row['Form'], row['Segments']) ] elif not 'n' in struc or not 't' in struc: errors['syllable'][' '.join(segments), ' '.join(struc)] += [ (row['ID'], i, row['Language_ID'], row['Form'], row['Segments']) ] for error, errorname in [('length', 'Length Errors'), ('missing', 'Missing Values'), ('syllable', 'Syllable Errors')]: if errors[error]: print('# ' + errorname + '\n') table = [] for i, ((segments, structure), examples) in enumerate(errors[error].items()): table += [[i + 1, segments, structure, len(examples)]] print( tabulate( table, tablefmt='pipe', headers=['Number', 'Segments', 'Structure', 'Examples'])) print('')
def run(args): """ Entry point for command-line call. """ # Extract dataset ds = get_dataset(args) # Read raw data and extend it with phonological information args.log.info("Loading data from %s...", ds) data = read_extended_data(ds, args) args.log.info("Read %i entries from CLDF.", len(data)) # Collect inventories args.log.info("Collecting inventories...") phoneme_count, syllable_count = collect_inventories(data) args.log.info("Read %i inventories.", len(phoneme_count)) # Collect inventories by size, testing the sample size needed args.log.info("Estimating sample sizes...") sampled = collect_sampled_inventories(data) args.log.info("Read %i inventories.", len(sampled)) # Estimate sample sizes, and compute the means for output ks_stats = defaultdict(list) size_stats = defaultdict(list) for lang, full in phoneme_count.items(): dist1 = [full.get(sound, None) for sound in sorted(full)] for sample_size in sampled: for i, sample in enumerate(sampled[sample_size][lang]): dist2 = [sample.get(sound, 0) for sound in sorted(full)] ks, p = scipy.stats.ks_2samp(dist1, dist2) ks_stats[lang, sample_size].append(ks) size_stats[lang, sample_size].append(len(sample)) ks_stats = {key: np.mean(ks_values) for key, ks_values in ks_stats.items()} size_stats = { key: np.mean([[size / len(phoneme_count[key[0]])] for size in sizes]) for key, sizes in size_stats.items() } output_sample_stats(ks_stats, size_stats, phoneme_count, args) # iterate over all phoneme inventories stats = {} for language, inventory in phoneme_count.items(): args.log.info("Processing inventory for %s...", language) lang_stats = analyze_inventory(inventory, language, args) stats[language] = lang_stats # Output statistics args.log.info("Writing results...") output_powerlaw_stats(stats, args)
def run(args):
    # Access the dataset:
    ds = get_dataset(args)
    print(ds.id)

    # ... and its CLDF dataset:
    print(len(list(ds.cldf_reader()['LanguageTable'])))

    # Thanks to `PathType`, `args.input_file` is a `pathlib.Path`:
    for c in args.input_file.read_text(encoding='utf8'):
        if args.strict:  # evaluates our flag
            # The CLTS catalog API is available as `args.clts.api`:
            print(args.clts.api.bipa[c].name)  # pragma: no cover
        else:
            args.log.warning('not very strict')

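# The demo command above assumes a `register` hook that wires up its arguments; that hook
# is not part of this snippet. The sketch below is a hypothetical reconstruction based on
# the comments in the command (a `PathType` positional, a `--strict` flag, and a CLTS
# catalog spec), using the standard clldutils/cldfbench helpers for those purposes.
from clldutils.clilib import PathType
from cldfbench.cli_util import add_catalog_spec


def register(parser):
    # `PathType` validates the argument and yields a `pathlib.Path`.
    parser.add_argument('input_file', type=PathType(type='file'))
    parser.add_argument('--strict', action='store_true', default=False)
    # Makes the CLTS catalog available as `args.clts` (and its API as `args.clts.api`).
    add_catalog_spec(parser, 'clts')
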
def run(args):
    ds = get_dataset(args)
    md = []
    cldfs = list(ds.cldf_specs_dict.values())
    if len(cldfs) > 1:
        md.append("# CLDF datasets\n")
        md.extend([
            '- [{}](#ds-{})'.format(cldf.module, slug(cldf.metadata_fname)) for cldf in cldfs])
        md.append('')
    for cldf in cldfs:
        if cldf.metadata_path.exists():
            md.append('<a name="ds-{}"> </a>\n'.format(slug(cldf.metadata_fname)))
            res = metadata2markdown(cldf.get_dataset(), cldf.metadata_path)
            md.append(res.replace('# ', '# {} '.format(cldf.module), 1))
            md.append('\n')
    ds.cldf_dir.joinpath('README.md').write_text('\n'.join(md), encoding='utf8')

def run(args):
    dataset = get_dataset(args)
    dataset.concepticon = args.concepticon.api
    dataset.glottolog = args.glottolog.api
    with_dataset(args, 'makecldf', dataset=dataset)

    if not dataset.cldf_dir.joinpath('sources.bib').exists():
        raise ValueError('The dataset has no sources at {0}'.format(
            dataset.cldf_dir.joinpath('sources.bib')))

    creators, contributors = dataset.get_creators_and_contributors(strict=False)

    def contrib(d):
        return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}

    with jsonlib.update_ordered(dataset.dir / '.zenodo.json', indent=4) as md:
        md.update({
            'title': dataset.metadata.title,
            "access_right": "open",
            "keywords": sorted(
                set(md.get('keywords', []) + ["linguistics", "cldf:Wordlist"])),
            "creators": [contrib(p) for p in creators],
            "contributors": [contrib(p) for p in contributors],
            "communities": sorted(
                md.get('communities', []) + [{"identifier": "lexibank"}],
                key=lambda i: i['identifier']),
            "upload_type": "dataset",
        })
        if dataset.metadata.citation:
            md['description'] = "<p>Cite the source of the dataset as:</p>\n\n" \
                "<blockquote>\n<p>{}</p>\n</blockquote>".format(
                    html.escape(dataset.metadata.citation))
        if dataset.metadata.zenodo_license:
            md['license'] = {'id': dataset.metadata.zenodo_license}

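# For orientation, the `.zenodo.json` written by the command above ends up with roughly
# this shape. The values here are illustrative only (not taken from any real dataset);
# the keys mirror what the code actually sets.
EXAMPLE_ZENODO_METADATA = {
    "title": "CLDF dataset derived from Example Source",
    "access_right": "open",
    "keywords": ["cldf:Wordlist", "linguistics"],
    "creators": [{"name": "Doe, Jane", "orcid": "0000-0000-0000-0000"}],
    "contributors": [{"name": "Roe, Richard", "type": "Other"}],
    "communities": [{"identifier": "lexibank"}],
    "upload_type": "dataset",
    "description": "<p>Cite the source of the dataset as:</p> ...",
    "license": {"id": "CC-BY-4.0"},
}
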
def run(args):
    ds = get_dataset(args)
    clts = args.clts.api

    # Load the profile(s) specified for the dataset
    profiles = {k or 'default': v for k, v in ds.orthography_profile_dict.items()}

    forms = collections.defaultdict(list)
    if ds.cldf_dir.joinpath('forms.csv').exists():
        for form in ds.cldf_reader()['FormTable']:
            forms[form.get('Profile')].append(ds.form_for_segmentation(form['Form']))
        if list(forms.keys()) == [None]:  # pragma: no cover
            forms['default'] = forms[None]

    for key, profile in profiles.items():
        args.log.info('Processing {0}'.format(profile.fname))
        profile.clean(clts, ipa_col=args.ipa)

        if args.trim:
            # Run the trimmer as many times as necessary until nothing more is left to remove.
            total_removed = 0
            while True:
                removed = profile.trim(ipa_col=args.ipa)
                total_removed += removed
                if removed == 0:
                    break
            if total_removed:  # pragma: no cover
                args.log.info("{} superfluous rules were removed.".format(total_removed))

        if args.augment and forms[key]:
            profile.augment(forms[key], clts=clts)
        if args.sort:
            profile.sort(clts=clts, ipa_col=args.ipa)

        profile.check(clts, args.log, ipa_col=args.ipa)
        profile.write()

def run(args):
    ds = None
    if Zenodo.DOI_PATTERN.match(args.dataset):
        z = Zenodo()
        out = z.download_record(z.record_from_doi(args.dataset), pathlib.Path('.'))
        args.log.info('Downloaded files for {0} to {1}'.format(args.dataset, out))
        cldf_ds = list(iter_datasets(out))
    else:
        p = pathlib.Path(args.dataset)
        if p.exists() and sniff(p):
            cldf_ds = [Dataset.from_metadata(p)]
        else:  # pragma: no cover
            ds = get_dataset(args)
            cldf_ds = [ds.cldf_reader()]

    if not cldf_ds:
        raise ValueError('No CLDF dataset found for spec {0}'.format(args.dataset))

    try:
        count_p = max([len(list(cldf['ParameterTable'])) for cldf in cldf_ds])
    except KeyError:
        count_p = 100

    default_page_size = 100
    while default_page_size < count_p and default_page_size < 600:
        default_page_size += 100  # pragma: no cover

    # max_returned_rows: maximum number of rows that can be returned from a table
    # or custom query (default=1000)
    db_paths = []
    if args.db_path:  # pragma: no cover
        if len(cldf_ds) > 1:
            raise ValueError('You cannot pass a db path when multiple datasets are found')
    else:
        args.db_path = pathlib.Path('{0}.sqlite'.format(ds.id if ds else 'cldf_db'))

    for i, cldf in enumerate(cldf_ds):
        if i == 0:
            db_path = args.db_path
        else:
            db_path = args.db_path.parent / (
                args.db_path.stem + '_{0}'.format(i) + args.db_path.suffix)
        if not db_path.exists():
            db = Database(cldf, fname=db_path, infer_primary_keys=True)
            db.write_from_tg()
            args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname))
        db_paths.append(db_path)

    jsonlib.dump(
        datasette_cldf.metadata({db.stem: cldf for db, cldf in zip(db_paths, cldf_ds)}),
        args.cfg_path,
        indent=4)

    os.system('datasette {0} -m {1} --template-dir {2} --config default_page_size:{3}'.format(
        ' '.join(str(p) for p in db_paths),
        args.cfg_path,
        pathlib.Path(datasette_cldf.__file__).parent / 'templates',
        default_page_size))

def run(args):
    ds = get_dataset(args)
    p = ds.cldf_dir / ds.cldf_reader().properties['dc:hasPart']['summary']['dc:relation']
    print(nexus.NexusReader(p).trees.trees[0].newick_tree.ascii_art())

def run(args):
    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid DOI passed')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error('You cannot create the release and update Zenodo at the same time.')
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error('The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(tqdm.tqdm(
                    [r for r in ds_cldf['media.csv']],
                    desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types \
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

        if args.list:
            for k, v in size.most_common():
                print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
            return

        # Wait for the download threads to finish
        if 'download_threads' in globals():
            for t in download_threads:
                t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)
        media.append(media_dir / INDEX_CSV)

        try:
            zipf = zipfile.ZipFile(
                str(release_dir / '{0}.zip'.format(MEDIA)), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')

        with jsonlib.update(
                release_dir / ZENODO_FILE_NAME, indent=4,
                default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [
                    {"identifier": community_id} for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo',
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'})
                supplement_to = " - Supplement to dataset " \
                    "<a href='https://doi.org/{0}'>{1}</a> ".format(
                        args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'})

            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(DESCRIPTION.format(
                url=online_url,
                formats=' ({0})'.format(formats) if formats else '',
                title=md['title'],
                supplement_to=supplement_to,
                descr=descr,
                online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

        DataDir(release_dir).write('README.md', README.format(
            title=md['title'],
            doi='https://doi.org/{0}'.format(args.parent_doi),
            ds_title=ds.metadata.title,
            license=license_md,
            formats=' ({0})'.format(formats) if formats else '',
            media=MEDIA,
            index=INDEX_CSV))

    if args.update_zenodo:
        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warning(
                'Passed deposit ID does not refer to latest version {0}!'.format(latest_version))
        args.log.info('  DOI:     ' + rec.metadata.doi)
        args.log.info('  Title:   ' + rec.metadata.title)
        args.log.info('  Version: ' + rec.metadata.version)
        args.log.info('  Date:    ' + rec.metadata.publication_date)
        args.log.info('  Files:   ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')

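# `_create_download_thread` and the module-level `download_threads` list are referenced in
# the media command above but not shown. The sketch below is a minimal, hypothetical
# reconstruction, assuming the helper starts one worker thread per file and registers it
# so the command can join on it later; the real helper may additionally retry, throttle,
# or verify checksums.
import threading
import urllib.request

download_threads = []


def _create_download_thread(url, target):
    def _download():
        # Fetch the file and write it to the local target path.
        with urllib.request.urlopen(url) as response:
            target.write_bytes(response.read())

    t = threading.Thread(target=_download)
    download_threads.append(t)
    t.start()
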