def from_md(cls, fname):
    """Instantiate from the first markdown table found in ``fname``.

    :param fname: path-like object pointing to a markdown file whose first \
    table lists one contributor per row; column headers are mapped to \
    `Contributor` attribute names via `norm_header`.
    :return: `cls` instance wrapping the list of `Contributor` objects.
    :raises ValueError: if two rows share the same ``id``.
    """
    header, rows = next(iter_markdown_tables(fname.read_text(encoding='utf8')))
    keys = [norm_header(col) for col in header]
    contributors = [Contributor(**dict(zip(keys, row))) for row in rows]
    byid = collections.Counter(c.id for c in contributors)
    # Guard against an empty table: most_common(1) on an empty Counter
    # returns [], and indexing it would raise IndexError.
    if byid and byid.most_common(1)[0][1] > 1:  # pragma: no cover
        raise ValueError(
            'duplicate ids: {0}'.format([k for k, v in byid.most_common() if v > 1]))
    return cls(contributors)
def run(args):
    """Print the tally of contributor bios.

    NOTE(review): only the Counter tally below is live; the early ``return``
    makes the markdown-table generation after it unreachable — presumably
    disabled on purpose, confirm before deleting.
    """
    roles = collections.Counter()
    for c in args.repos.contributors:
        roles.update([c.bio])
    for k, v in roles.most_common():
        print(k, v)
        break  # only the most common entry is printed
    return
    # --- unreachable below this point (see NOTE in the docstring) ---
    # name -> (bio, photo), scraped from the HTML page.
    bios = {r[0]: (r[1], r[2]) for r in iter_html_data(args.html)}
    # id | Last name | First name | Node | Status | Language competence | GitHub-username | email
    header, rows = next(
        iter_markdown_tables(
            args.repos.path('CONTRIBUTORS_details.md').read_text(
                encoding='utf8')))
    rows = [dict(zip(header, row)) for row in rows]
    rows = collections.OrderedDict([(r['id'], r) for r in rows])
    contribs = args.repos.contributors
    contribs = collections.OrderedDict([(r.id, r) for r in contribs])
    # Every id in the details table must be a known contributor.
    assert not set(rows) - set(contribs)
    lnames = {c.last_name: c.id for c in contribs.values()}
    fnames = {
        '{0.first_name} {0.last_name}'.format(c): c.id
        for c in contribs.values()
    }
    # Match scraped names to contributor ids: explicit NAME2ID mapping first,
    # then last-name match, then "first last" match.
    bios_by_id = {}
    for name in bios:
        if name in NAME2ID:
            bios_by_id[NAME2ID[name]] = bios[name]
            continue
        hname = HumanName(name)
        if hname.last in lnames:
            bios_by_id[lnames[hname.last]] = bios[name]
            continue
        full = '{0.first} {0.last}'.format(hname)
        if full in fnames:
            bios_by_id[fnames[full]] = bios[name]
            continue
        #print('---', name)
    # Emit a markdown table: header and separator row once, then one row
    # per contributor.
    for i, (cid, c) in enumerate(contribs.items()):
        bio, photo = bios_by_id.get(cid, (None, None))
        if bio:
            # Bios land in a markdown table cell, so they must be
            # single-line and free of the cell separator.
            assert '\n' not in bio and ('|' not in bio)
        md = rows.get(cid, {})
        md['bio'] = bio
        md['photo'] = photo
        contrib = get_row(c, md)
        if i == 0:
            print(' | '.join(contrib.keys()))
            print(' | '.join([' --- ' for _ in range(len(contrib.keys()))]))
        print(' | '.join(contrib.values()))
def editors(self) -> typing.List[Editor]:
    """Read the list of editors from the first table in CONTRIBUTORS.md.

    Each row holds a period (``start``, ``start-``, or ``start-end``) and a
    name; an open-ended period (``start-``) yields ``None`` as end, a bare
    year yields ``start`` as both start and end.
    """
    text = self.path('CONTRIBUTORS.md').read_text(encoding='utf8')
    _, table_rows = next(iter_markdown_tables(text))
    result = []
    for period, editor_name in table_rows:
        first, dash, last = period.strip().partition('-')
        first = first.strip()
        last = last.strip()
        if not dash:
            until = first
        else:
            until = last or None
        result.append(Editor(editor_name.strip(), first, until))
    return result
def read_editions(repos):
    """Read released editions from the first table in CONTRIBUTORS.md.

    :param repos: repository object providing ``path()``.
    :return: list of dicts (one per edition, keys lowercased, the
        ``'editors'`` cell split on ``&`` into a list of names), sorted by
        descending version number.
    """
    table_header, table_rows = next(
        iter_markdown_tables(
            repos.path('CONTRIBUTORS.md').read_text(encoding='utf8')))
    keys = [col.lower() for col in table_header]
    editions = []
    for values in table_rows:
        edition = dict(zip(keys, values))
        edition['editors'] = [
            name.strip() for name in edition['editors'].split('&')]
        editions.append(edition)
    # Newest edition first; parse_version gives proper semantic ordering.
    editions.sort(
        key=lambda d: pkg_resources.parse_version(d['version']),
        reverse=True)
    return editions
def cmd_makecldf(self, args):
    """Create the CLDF dataset from the Grambank raw data and regenerate
    this dataset's CONTRIBUTORS.md from the upstream contributor table.

    :param args: CLI args providing ``writer``, ``glottolog`` and ``log``.
    """
    repos = Grambank(self.raw_dir / 'Grambank', wiki=self.raw_dir / 'grambank.wiki')
    create(args.writer.cldf, repos, args.glottolog.api)
    self.cldf_reader().validate(log=args.log)
    # First markdown table of the upstream CONTRIBUTORS.md.
    header, contribs = next(
        iter_markdown_tables(
            self.raw_dir.joinpath(
                'Grambank', 'CONTRIBUTORS.md').read_text(encoding='utf8')))
    # Fix: write UTF-8 explicitly — contributor names may contain non-ASCII
    # characters and write_text would otherwise use the platform default
    # encoding (the matching read above already assumes UTF-8).
    self.dir.joinpath('CONTRIBUTORS.md').write_text(
        CONTRIBUTORS_TMPL.format('\n'.join([
            '{First name} {Last name} | | author'.format(
                **dict(zip(header, row)))
            for row in contribs])),
        encoding='utf8')
def get_creators_and_contributors(text, strict=True):
    """Split the people listed in the first markdown table of ``text`` into
    creators and contributors.

    Rows whose (comma-separated) role includes author/creator/maintainer go
    into the creators list; any other role marks a contributor, with the
    role mapped to its canonical spelling from `CONTRIBUTOR_TYPES`.

    :param text: markdown text containing (at most) one contributor table.
    :param strict: if ``True``, an unknown role raises ``KeyError``; \
    otherwise it is mapped to ``'Other'``.
    :return: pair ``(creators, contributors)`` of lists of dicts.
    """
    type_by_lower = {name.lower(): name for name in CONTRIBUTOR_TYPES}
    creators, contributors = [], []
    try:
        columns, table = next(iter_markdown_tables(text))
    except StopIteration:  # pragma: no cover
        # No table at all -> nothing to report.
        return creators, contributors
    creator_roles = {'author', 'creator', 'maintainer'}
    for values in table:
        person = {col.lower(): val for col, val in zip(columns, values)}
        roles = nfilter(
            [part.strip().lower() for part in person.get('role', '').split(',')])
        for role in roles:
            entry = {key: val for key, val in person.items() if key != 'role'}
            if role in creator_roles:
                if entry not in creators:
                    creators.append(entry)
                continue
            if strict:
                entry['type'] = type_by_lower[role]
            else:
                entry['type'] = type_by_lower.get(role, 'Other')
            if entry not in contributors:
                contributors.append(entry)
    return creators, contributors
def main(args):
    """Populate the TuLaR clld database from the project's CLDF datasets."""
    # Records are sorted newest-first per (org, repos), so next(recs) below
    # picks the latest released version of each known dataset.
    for (org, repos), recs in itertools.groupby(
            sorted(oai.Records('tular'),
                   key=lambda r: (r.repos.org, r.repos.repos, r.version),
                   reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            DATASETS[repos] = next(recs)
    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        description="Tupían Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon': 'cc-by-sa.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 4.0 International License'},
    )
    # Default project dir: three levels up from the tular package.
    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')
    # One Database contribution per dataset repo, with its contributors.
    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database,
            db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
            # Contributors are deduplicated across datasets by slugified
            # last name.
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))
    # Dataset-level editors, in fixed order.
    for i, cid in enumerate(
            ['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid],
                          dataset=dataset,
                          ord=i))
    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    # NOTE(review): `subgroups` is accumulated but never used later in this
    # function — possibly a leftover; confirm before removing.
    subgroups = []
    for row in args.cldf['LanguageTable']:
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(Family, row['Family'],
                              id=slug(row['Family']), name=row['Family'])
        data.add(
            Doculect,
            row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )
    # TuDeT examples (dependency-treebank sentences), skipping duplicate IDs.
    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' / 'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(id=row['ID'],
                    name=row['Primary_Text'],
                    description=row['Translated_Text'],
                    language=data['Doculect'][row['Language_ID']],
                    conllu=row['conllu']))
    # Everything below is attributed to the 'tuled' contribution.
    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept,
            row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    # One ValueSet per (language, concept); groupby requires the matching
    # sort by the same key.
    for (lid, pid), rows in itertools.groupby(
            sorted(args.cldf.iter_rows('FormTable', 'languageReference',
                                       'parameterReference'),
                   key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet,
            vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word,
                row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join([k for k in row['PartialCognates']])
                if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])
        for ref in refs:
            if ref in source_ids:
                # NOTE(review): membership is tested on the raw ref, but the
                # lookup key is slug(ref, lowercase=False) — presumably
                # add_sources yields raw ids while Source.id is slugified;
                # confirm the two stay in sync.
                DBSession.add(
                    common.ValueSetReference(valueset=vs,
                                             source_pk=sources[slug(
                                                 ref, lowercase=False)]))
    load_inventories(args.cldf, clts, data['Doculect'])
    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset,
                row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )