def test_Sources_with_None_values(self):
    from pycldf.sources import Sources, Source

    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = self.tmp_path('test.bib')
    src.write(bib.name, bib.parent)

def to_cldf(self, dest, mdname='cldf-metadata.json'):
    """
    Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

    :param dest:
    :param mdname:
    :return: path of the metadata file
    """
    dest = Path(dest)
    if not dest.exists():
        dest.mkdir()

    data = self.read()

    if data[self.source_table_name]:
        sources = Sources()
        for src in data[self.source_table_name]:
            sources.add(Source(
                src['genre'],
                src['id'],
                **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
        sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

    for table_type, items in data.items():
        try:
            table = self.dataset[table_type]
            table.common_props['dc:extent'] = table.write(
                [self.retranslate(table, item) for item in items],
                base=dest)
        except KeyError:
            assert table_type == self.source_table_name, table_type
    return self.dataset.write_metadata(dest / mdname)

def test_Source_expand_refs():
    sources = Sources()
    src = Source(
        'book', 'Meier2005', author='Hans Meier', year='2005', title='The Book')
    assert 'Meier2005' in repr(src)
    sources.add(src)
    bib = sources._bibdata.to_string(bib_format='bibtex')
    assert len(bib.split('author')) == 2

    assert len(list(sources.expand_refs('Meier2005'))) == 1
    bib = sources._bibdata.to_string(bib_format='bibtex')
    assert len(bib.split('author')) == 2

    assert len(list(sources.expand_refs('12345'))) == 1

def test_Source_expand_refs(self):
    from pycldf.sources import Sources, Source

    sources = Sources()
    src = Source(
        'book', 'Meier2005', author='Hans Meier', year='2005', title='The Book')
    self.assertIn('Meier2005', repr(src))
    sources.add(src)
    bib = sources._bibdata.to_string(bib_format='bibtex')
    self.assertEqual(len(bib.split('author')), 2)

    self.assertEqual(len(list(sources.expand_refs('Meier2005'))), 1)
    bib = sources._bibdata.to_string(bib_format='bibtex')
    self.assertEqual(len(bib.split('author')), 2)

def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.
    """
    with self.cldf_writer(args) as writer:
        writer.cldf.add_component('CognatesetTable')
        writer.add_sources(*self.raw_dir.read_bib('cariban_resolved.bib'))
        cmap = writer.add_concepts(lookup_factory=lambda c: c.english)
        cmap['you'] = cmap['thou']
        cmap['grease/fat'] = cmap['grease']
        cmap['breast'] = cmap['breasts']
        cmap['son'] = cmap['person']

        data = pycldf.Dataset.from_metadata(self.raw_dir / 'cariban_data.json')
        for lang in data['LanguageTable']:
            writer.add_language(ID=lang['ID'], Name=lang['Name'], Glottocode=lang["Glottocode"])

        cs_seen = set()
        reconstructions = {
            tuple(c['ID'].split('-')): c['Form']
            for c in self.raw_dir.read_csv('cariban_lexical_reconstructions.csv', dicts=True)
        }
        for lex in self.raw_dir.read_csv('cariban_swadesh_list.csv', dicts=True):
            # "Language_ID","Swadesh_Nr","Feature_ID","Value","Cognateset_ID","Source","Comment","Full_Form"
            if lex['Feature_ID'] not in cmap:
                print(lex['Feature_ID'])
                continue
            for form in writer.add_lexemes(
                Value=lex['Value'],
                Parameter_ID=cmap[lex['Feature_ID']],
                Language_ID=lex['Language_ID'],
                Source=[
                    Reference(*d) for d in [Sources.parse(lex['Source'].replace(';', ','))]
                ] if lex['Source'] and not lex['Source'].startswith('pc') else [],
            ):
                cs_key = (lex['Feature_ID'], lex['Cognateset_ID'])
                cs_id = '{}-{}'.format(cmap[cs_key[0]], cs_key[1])
                if cs_key not in cs_seen:
                    writer.objects['CognatesetTable'].append(dict(
                        ID=cs_id,
                        Description=reconstructions.get(cs_key),
                    ))
                    cs_seen.add(cs_key)
                writer.add_cognate(lexeme=form, Cognateset_ID=cs_id)

        # Note: We want to re-use LanguageTable across the two CLDF datasets:
        LanguageTable = writer.cldf['LanguageTable']

    with self.cldf_writer(args, cldf_spec='structure', clean=False) as writer:
        writer.cldf.add_component(LanguageTable)  # we reuse the one from above!

def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4):
    """
    Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

    :param dest:
    :param mdname:
    :return: path of the metadata file
    """
    dest = pathlib.Path(dest)
    if not dest.exists():
        dest.mkdir()

    data = self.read()

    if data[self.source_table_name]:
        sources = Sources()
        for src in data[self.source_table_name]:
            sources.add(Source(
                src['genre'],
                src['id'],
                **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
        sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

    for table_type, items in data.items():
        try:
            table = self.dataset[table_type]
            items = [
                self.round_geocoordinates(item, precision=coordinate_precision)
                for item in items]
            table.common_props['dc:extent'] = table.write(
                [self.retranslate(table, item) for item in items],
                base=dest)
        except KeyError:
            assert table_type == self.source_table_name, table_type
    return self.dataset.write_metadata(dest / mdname)

def __init__(self, name):
    assert NAME_PATTERN.match(name)
    self.name = name
    self.sources = Sources()
    self.metadata = Metadata()
    self._rows = OrderedDict()

    # We store the fields (a.k.a. header) as tuple because it must be immutable after
    # first assignment (since changing is not well defined when there are already
    # rows).
    self._fields = ()
    self._source_count = None
    self._cited_sources = set()
    self._table = None

def test_field_order(tmpdir):
    srcs = Sources()
    src = Source('misc', 'x')
    # src is an OrderedDict and we add title *after* year.
    src['year'] = '2018'
    src['title'] = 'The Title'
    srcs.add(src)
    bib = tmpdir / 'test.bib'
    srcs.write(str(bib))
    res = bib.read_text(encoding='utf8')
    # Still, title should be printed in the BibTeX before year:
    assert res.index('title =') < res.index('year =')

def import_values(values, lang, features, codes, contributors, sources):  # pragma: no cover
    c = Contribution(
        id=lang['ID'],
        name='Dataset for {0}'.format(lang['Name']),
    )
    for i, cid in enumerate(lang['Coders'], start=1):
        DBSession.add(ContributionContributor(
            contribution=c,
            contributor_pk=contributors[cid],
            ord=i,
        ))

    l = GrambankLanguage(
        id=lang['ID'],
        name=lang['Name'],
        macroarea=lang['Macroarea'],
        latitude=lang['Latitude'],
        longitude=lang['Longitude'],
    )

    for value in values:
        vs = ValueSet(
            id=value['ID'],
            parameter_pk=features[value['Parameter_ID']],
            language=l,
            contribution=c,
        )
        Value(
            id=value['ID'],
            valueset=vs,
            name=value['Value'],
            description=value['Comment'],
            domainelement_pk=codes[value['Code_ID'] or '{}-NA'.format(value['Parameter_ID'])])
        if value['Source']:
            for ref in value['Source']:
                sid, pages = Sources.parse(ref)
                ValueSetReference(valueset=vs, source_pk=sources[sid], description=pages)
    DBSession.add(c)

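The `Sources.parse(ref)` call used above splits a reference string of the form `<source_id>[<context>]` into the source key and an optional context part (typically page numbers), which is then stored on the reference object. A minimal sketch of that behaviour, with made-up keys purely for illustration:

from pycldf.sources import Sources

# The bracketed suffix is returned as the context (here: pages).
sid, pages = Sources.parse('huber2005[10-15]')
assert (sid, pages) == ('huber2005', '10-15')

# A bare key has no context part.
sid, pages = Sources.parse('huber2005')
assert sid == 'huber2005' and not pages

Malformed references such as 'a[x' or '[x]' raise ValueError, as exercised in the test_Sources snippet further down.
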
def test_Sources_roundtrip_latex(tmpdir, bibtex, expected):
    src = Sources()
    src.add(bibtex)
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
    assert expected in bib.read_text('utf8')

def test_Sources_with_None_values(tmpdir):
    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = tmpdir / 'test.bib'
    src.write(str(bib))

def test_Sources(tmpdir):
    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))

    for entry in src:
        assert entry.genre == 'book'
        break

    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3

    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'

    with pytest.raises(ValueError):
        src.add(5)

    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n author="a.b"\n}')

    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover

    with pytest.raises(ValueError):
        src.parse('a[x')

    with pytest.raises(ValueError):
        src.parse('[x]')

    with pytest.raises(ValueError):
        src.validate(['x'])

    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1

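For readers who only need the happy path that this test exercises, here is a minimal, self-contained sketch of the round-trip: build a collection, write it as BibTeX, and read it back. The file name 'sources.bib' and the entry data are illustrative, not taken from any of the projects above.

from pycldf.sources import Source, Sources

src = Sources()
src.add(Source('book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))

# Write the collection as a BibTeX file ...
src.write('sources.bib')

# ... and read it back into a fresh collection.
src2 = Sources.from_file('sources.bib')
assert 'Huber' in '%s' % src2['huber2005']
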
def __init__(self, tablegroup):
    self.tablegroup = tablegroup
    self.auto_constraints()
    self.sources = Sources.from_file(self.bibpath)

def test_Sources(self):
    from pycldf.sources import Sources, Source

    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    self.assertEqual(len(list(src.items())), 3)
    self.assertEqual(len(list(src.keys())), 3)

    refs = 'huber2005[1-6];Obrazy;Elegie[34]'
    self.assertEqual(src.format_refs(*list(src.expand_refs(refs))), refs)
    self.assertEqual('%s' % src['huber2005'], 'Huber, Herrmann. 2005. y.')

    with self.assertRaises(ValueError):
        src.add(5)

    with self.assertRaises(ValueError):
        src.add('@misc{a.b,\n author="a.b"\n}')

    bib = self.tmp_path('test.bib')
    src.write(bib.name, bib.parent)

    src2 = Sources()
    src2.read(bib.name, bib.parent)

    bib = self.tmp_path('test.bib')
    src2.write(bib.name, bib.parent, ids=['huber2005'])
    src = Sources()
    src.read(bib.name, bib.parent)
    self.assertEqual(len(src), 1)

def cmd_makecldf(self, args):
    # Add sources
    sources = Sources.from_file(self.raw_dir / "sources.bib")
    args.writer.cldf.add_sources(*sources)

    glottolog = Glottolog(args.glottolog.dir)
    clts = CLTS(Config.from_file().get_clone('clts'))
    bipa = clts.bipa
    clts_saphon = clts.transcriptiondata_dict['saphon']

    # Add components
    args.writer.cldf.add_columns(
        "ValueTable",
        {"name": "Value_in_Source", "datatype": "string"})

    cltstable = Terms()["cltsReference"].to_column().asdict()
    cltstable["datatype"]["format"] = "[a-z_-]+|NA"

    args.writer.cldf.add_columns(
        'ParameterTable',
        cltstable,
        {'name': 'CLTS_BIPA', 'datatype': 'string'},
        {'name': 'CLTS_Name', 'datatype': 'string'})
    args.writer.cldf.add_component("LanguageTable", "Family", "Glottolog_Name")

    languages = []
    #all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
    #iso2glot = {lng.iso: lng.glottocode for lng in all_glottolog.values()}
    #args.log.info("loaded glottolog")
    for row in progressbar(self.etc_dir.read_csv("languages.csv", dicts=True)):
        #if row["SAPHON_Code"] in iso2glot:
        #    glottocode = iso2glot[row["SAPHON_Code"]]
        #elif row["SAPHON_Code"][:3] in iso2glot:
        #    glottocode = iso2glot[row["SAPHON_Code"][:3]]
        #else:
        #    glottocode = ""
        #if glottocode and glottocode in all_glottolog:
        #    lang = all_glottolog[glottocode]
        #    update = {
        #        "Family": lang.family.name if lang.family else '',
        #        "Glottocode": glottocode,
        #        "Latitude": lang.latitude,
        #        "Longitude": lang.longitude,
        #        "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
        #        "Glottolog_Name": lang.name,
        #    }
        #    row.update(update)
        languages.append(row)

    # Build source map from language
    source_map = {k: v for k, v in self.raw_dir.read_csv("references.tsv", delimiter="\t")}

    # Parse sources
    segments = []
    values = []
    counter = 1
    unknowns = defaultdict(list)
    for lid, segment in self.raw_dir.read_csv('inventories.tsv', delimiter="\t"):
        normalized = normalize_grapheme(segment)
        if normalized in clts_saphon.grapheme_map:
            sound = bipa[clts_saphon.grapheme_map[normalized]]
        else:
            sound = bipa['<NA>']
            unknowns[normalized] += [(lid, segment)]
        par_id = compute_id(normalized)
        if sound.type == 'unknownsound':
            bipa_grapheme = ''
            desc = ''
        else:
            bipa_grapheme = str(sound)
            desc = sound.name
        segments.append((par_id, normalized, bipa_grapheme, desc))

        values.append({
            "ID": str(counter),
            "Language_ID": lid,
            "Parameter_ID": par_id,
            "Value_in_Source": segment,
            "Value": normalized,
            "Source": [source_map[lid]],
        })
        counter += 1

    # Build segment data
    parameters = [
        {
            "ID": ID,
            "Name": normalized,
            "Description": '',
            "CLTS_ID": desc.replace(' ', '_') if desc.strip() else "NA",
            "CLTS_BIPA": bipa_grapheme,
            "CLTS_Name": desc,
        }
        for ID, normalized, bipa_grapheme, desc in set(segments)]

    # Write data and validate
    args.writer.write(**{
        "ValueTable": values,
        "LanguageTable": languages,
        "ParameterTable": parameters,
    })

    for g, rest in unknowns.items():
        print('\t'.join([repr(g), str(len(rest)), g]))

def load(self, ds, args=None, verbose=False):
    """
    Load a CLDF dataset into the database.

    :param dataset:
    :return:
    """
    print(ds)
    try:
        self.fetchone('select ID from dataset')
    except sqlite3.OperationalError:
        self.create(force=True)
    self.unload(ds)
    dataset = ds.cldf_reader()
    tables, ref_tables = schema(dataset)

    # update the DB schema:
    for t in tables:
        if self._create_table_if_not_exists(t):
            continue
        db_cols = {k.lower(): v for k, v in self.tables[t.name].items()}
        for col in t.columns:
            if col.name.lower() not in db_cols:
                with self.connection() as conn:
                    conn.execute(
                        "ALTER TABLE {0} ADD COLUMN `{1.name}` {1.db_type}".format(t.name, col))
            else:
                if db_cols[col.name.lower()] != col.db_type:
                    raise ValueError(
                        'column {0}:{1} {2} redefined with new type {3}'.format(
                            t.name, col.name, db_cols[col.name.lower()], col.db_type))

    for t in ref_tables.values():
        self._create_table_if_not_exists(t)

    self.update_schema()

    # then load the data:
    with self.connection() as db:
        db.execute('PRAGMA foreign_keys = ON;')
        insert(
            db,
            'dataset',
            'ID,name,version,metadata_json',
            (
                ds.id,
                '{0}'.format(dataset),
                ds.repo.hash() if ds.repo else '',
                json.dumps(dataset.metadata_dict)))
        insert(
            db,
            'datasetmeta',
            'dataset_ID,key,value',
            *[(ds.id, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

        # load sources:
        rows = []
        for src in dataset.sources.items():
            values = [ds.id, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
            values.append(
                json.dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
            rows.append(tuple(values))
        insert(
            db,
            'SourceTable',
            ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
            *rows)

        # For regular tables, we extract and keep references to sources.
        refs = collections.defaultdict(list)

        for t in tables:
            # We want to lookup columns by the name used in the CLDF dataset.
            cols = {col.cldf_name: col for col in t.columns}
            # But we also want to look up primary keys by the database column name.
            cols_by_name = {col.name: col for col in t.columns}
            ref_table = ref_tables.get(t.name)
            rows, keys = [], []
            try:
                for row in dataset[t.name]:
                    keys, values = ['dataset_ID'], [ds.id]
                    for k, v in row.items():
                        if ref_table and k == ref_table.consumes:
                            col = cols_by_name[t.primary_key]
                            refs[ref_table.name].append((row[col.cldf_name], v))
                        else:
                            col = cols[k]
                            if isinstance(v, list):
                                v = (col.separator or ';').join(
                                    nfilter(col.convert(vv) for vv in v))
                            else:
                                v = col.convert(v)  # FIXME: only if non-local!
                            keys.append("`{0}`".format(col.name))
                            values.append(v)
                    keys, values = self.update_row(t.name, keys, values)
                    rows.append(tuple(values))
                insert(db, t.name, keys, *rows, **{'verbose': verbose})
            except FileNotFoundError:  # pragma: no cover
                if t.name != 'CognateTable':  # An empty CognateTable is allowed.
                    raise  # pragma: no cover

        # Now insert the references, i.e. the associations with sources:
        for tname, items in refs.items():
            rows = []
            for oid, sources in items:
                for source in sources:
                    sid, context = Sources.parse(source)
                    rows.append([ds.id, oid, sid, context])
            oid_col = '{0}_ID'.format(tname.replace('Source', ''))
            insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
        db.commit()

class Dataset(object):
    """
    API to access a CLDF dataset.
    """
    def __init__(self, name):
        assert NAME_PATTERN.match(name)
        self.name = name
        self.sources = Sources()
        self.metadata = Metadata()
        self._rows = OrderedDict()

        # We store the fields (a.k.a. header) as tuple because it must be immutable after
        # first assignment (since changing is not well defined when there are already
        # rows).
        self._fields = ()
        self._source_count = None
        self._cited_sources = set()
        self._table = None

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.name)

    def __len__(self):
        """The length of a dataset is the number of rows in the values file."""
        return len(self.rows)

    def __getitem__(self, item):
        """
        Individual rows can be accessed by integer index or by row ID.

        :param item: `int` to access row by index, `str` to access by row ID
        :return: `OrderedDict`
        """
        if isinstance(item, int):
            return self.rows[item]
        return self._rows[item]

    @property
    def fields(self):
        """
        Read-only property to access the fields (a.k.a. header) defined for the dataset.

        :return: `tuple` of field names
        """
        return self._fields

    @property
    def table(self):
        return self._table

    @fields.setter
    def fields(self, value):
        """
        Fields can be assigned (but only once) for a dataset.

        :param value: `tuple` of field names.
        """
        if self._fields:
            raise ValueError('fields can only be assigned once!')
        assert isinstance(value, tuple)
        assert all(any(field in value for field in variants) for variants in REQUIRED_FIELDS)
        table = self.metadata.get_table()
        if table:
            assert list(value) == list(table.schema.columns.keys())
        else:
            table = self.metadata.add_table(
                'values', '', [{'name': col, 'datatype': 'string'} for col in value])
            table.schema.primaryKey = 'ID'
        self._table = table
        self._fields = value

    @property
    def rows(self):
        return list(self._rows.values())

    @property
    def stats(self):
        return dict(
            languages=set(row['Language_ID'] for row in self.rows),
            parameters=set(row['Parameter_ID'] for row in self.rows),
            rowcount=(
                len(self),
                sum([1 for row in self.rows if row['Language_ID'] and row['Parameter_ID']])),
            values=Counter(row['Value'] for row in self.rows),
        )

    def add_row(self, row):
        if not row:
            return
        d = ValuesRow.from_list(self, row)
        if d['ID'] in self._rows:
            raise ValueError('duplicate row ID: %s' % d['ID'])
        for ref in self.sources.expand_refs(d.get('Source', '')):
            self._cited_sources.add(ref.source.id)
        self._rows[d['ID']] = d
        return d

    @staticmethod
    def filename(fname, type_):
        """
        Compute the path for optional CLDF files relative to a given values file.

        :param fname: Path of the values file
        :param type_: Type of the optional file
        :return: name of the optional file
        """
        if type_ == 'sources':
            return fname.stem + '.bib'
        if type_ == 'metadata':
            return fname.stem + fname.suffix + MD_SUFFIX
        raise ValueError(type_)  # pragma: no cover

    @staticmethod
    def _existing_file(fname):
        fname = Path(fname)
        assert fname.exists() and fname.is_file()
        return fname

    @classmethod
    def _from(cls, data, container=None, skip_on_error=False):
        container = container or data.parent
        dataset = cls(data.stem)
        dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
        dataset._table = dataset.metadata.get_table()
        dataset.sources.read(Dataset.filename(data, 'sources'), container)
        delimiter = ','
        if dataset.table:
            delimiter = dataset.table.dialect.delimiter
        if data.suffix in TAB_SUFFIXES:
            delimiter = '\t'
        if isinstance(container, Archive):
            rows = container.read_text(data.name).split('\n')
        else:
            rows = data
        for i, row in enumerate(reader(rows, delimiter=delimiter)):
            if i == 0:
                dataset.fields = tuple(row)
            else:
                try:
                    dataset.add_row(row)
                except ValueError as e:
                    if skip_on_error:
                        log.warn('skipping row in line %s: %s' % (i + 1, e))
                    else:
                        raise e
        dataset.table.dialect.delimiter = delimiter
        dataset.table.url = data.name
        return dataset

    @classmethod
    def from_zip(cls, fname, name=None):
        archive = Archive(cls._existing_file(fname))
        return cls._from(
            Path(archive.metadata_name(prefix=name)[:-len(MD_SUFFIX)]), archive)

    @classmethod
    def from_metadata(cls, fname, container=None):
        fname = Path(fname)
        if not fname.name.endswith(MD_SUFFIX):
            raise ValueError('metadata file name must end with %s' % MD_SUFFIX)
        return cls._from(
            fname.parent.joinpath(fname.name[:-len(MD_SUFFIX)]), container=container)

    @classmethod
    def from_file(cls, fname, skip_on_error=False):
        """
        Factory method to create a `Dataset` from a CLDF values file.

        :param fname: Path of the CLDF values file.
        :return: `Dataset` instance.
        """
        return cls._from(cls._existing_file(fname), skip_on_error=skip_on_error)

    def write(self, outdir='.', suffix='.csv', cited_sources_only=False, archive=False):
        outdir = Path(outdir)
        if not outdir.exists():
            raise ValueError(outdir.as_posix())
        close = False
        if archive:
            if isinstance(archive, Archive):
                container = archive
            else:
                container = Archive(outdir.joinpath(self.name + '.zip'), mode='w')
                close = True
        else:
            container = outdir
        fname = Path(outdir).joinpath(self.name + suffix)
        if fname.suffix in TAB_SUFFIXES:
            self.table.dialect.delimiter = '\t'
        with UnicodeWriter(
                None if isinstance(container, Archive) else fname,
                delimiter=self.table.dialect.delimiter) as writer:
            writer.writerow(self.fields)
            for row in self.rows:
                writer.writerow(row.to_list())
            if isinstance(container, Archive):
                container.write_text(writer.read(), fname.name)
        self.table.url = fname.name
        self.metadata.write(Dataset.filename(fname, 'metadata'), container)
        ids = self._cited_sources if cited_sources_only else None
        self.sources.write(Dataset.filename(fname, 'sources'), container, ids=ids)
        if close:
            container.close()

def load(self, dataset):
    """
    Load a CLDF dataset into the database.

    :param dataset:
    :return:
    """
    tables, ref_tables = schema(dataset)

    # update the DB schema:
    for t in tables:
        if self._create_table_if_not_exists(t):
            continue
        db_cols = {
            r[1]: r[2]
            for r in self.fetchall("PRAGMA table_info({0})".format(t.name))
        }
        for col in t.columns:
            if col.name not in db_cols:
                with self.connection() as conn:
                    conn.execute(
                        "ALTER TABLE {0} ADD COLUMN \"{1.name}\" {1.db_type}".format(t.name, col))
            else:
                if db_cols[col.name] != col.db_type:
                    raise ValueError(
                        'column {0}:{1} {2} redefined with new type {3}'.format(
                            t.name, col.name, db_cols[col.name], col.db_type))

    for t in ref_tables.values():
        self._create_table_if_not_exists(t)

    # then load the data:
    with self.connection() as db:
        db.execute('PRAGMA foreign_keys = ON;')
        pk = max([r[0] for r in self.fetchall("SELECT ID FROM dataset", conn=db)] or [0]) + 1
        insert(
            db,
            'dataset',
            'ID,name,module,metadata_json',
            (pk, '{0}'.format(dataset), dataset.module, dumps(dataset.metadata_dict)))
        insert(
            db,
            'datasetmeta',
            'dataset_ID,key,value',
            *[(pk, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

        # load sources:
        rows = []
        for src in dataset.sources.items():
            values = [pk, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
            values.append(
                dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
            rows.append(tuple(values))
        insert(
            db,
            'SourceTable',
            ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
            *rows)

        # For regular tables, we extract and keep references to sources.
        refs = defaultdict(list)

        for t in tables:
            cols = {col.name: col for col in t.columns}
            ref_table = ref_tables.get(t.name)
            rows, keys = [], []
            for row in dataset[t.name]:
                keys, values = ['dataset_ID'], [pk]
                for k, v in row.items():
                    if ref_table and k == ref_table.consumes:
                        refs[ref_table.name].append((row[t.primary_key], v))
                    else:
                        col = cols[k]
                        if isinstance(v, list):
                            v = (col.separator or ';').join(col.convert(vv) for vv in v)
                        else:
                            v = col.convert(v)
                        keys.append(k)
                        values.append(v)
                rows.append(tuple(values))
            insert(db, t.name, keys, *rows)

        # Now insert the references, i.e. the associations with sources:
        for tname, items in refs.items():
            rows = []
            for oid, sources in items:
                for source in sources:
                    sid, context = Sources.parse(source)
                    rows.append([pk, oid, sid, context])
            oid_col = '{0}_ID'.format(tname.replace('Source', ''))
            insert(db, tname, ['dataset_ID', '{:}'.format(oid_col), 'Source_ID', 'Context'], *rows)
        db.commit()