def test_Sources(tmpdir):
    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    for entry in src:
        assert entry.genre == 'book'
        break
    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'
    with pytest.raises(ValueError):
        src.add(5)
    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n author="a.b"\n}')
    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover
    with pytest.raises(ValueError):
        src.parse('a[x')
    with pytest.raises(ValueError):
        src.parse('[x]')
    with pytest.raises(ValueError):
        src.validate(['x'])
    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1
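# A minimal usage sketch of the Sources API exercised above; it is illustrative
# (not part of the original suite), and the entry key 'meier2010' is made up.
# Only calls shown in test_Sources are used.
def _sources_usage_sketch(tmpdir):
    src = Sources()
    src.add(Source('book', 'meier2010', author='Hans Meier', year='2010', title='x'))
    bib = str(tmpdir / 'sketch.bib')
    src.write(bib)  # serialize to BibTeX
    # reading the file back yields the same single entry
    assert len(Sources.from_file(bib)) == 1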
def to_cldf(self, dest, mdname='cldf-metadata.json'):
    """
    Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

    :param dest: Destination directory for the CLDF data.
    :param mdname: Filename of the metadata file to be written.
    :return: path of the metadata file
    """
    dest = Path(dest)
    if not dest.exists():
        dest.mkdir()

    data = self.read()

    if data[self.source_table_name]:
        sources = Sources()
        for src in data[self.source_table_name]:
            sources.add(Source(
                src['genre'],
                src['id'],
                **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
        sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

    for table_type, items in data.items():
        try:
            table = self.dataset[table_type]
            table.common_props['dc:extent'] = table.write(
                [self.retranslate(table, item) for item in items], base=dest)
        except KeyError:
            assert table_type == self.source_table_name, table_type
    # honor the mdname parameter instead of hard-coding 'cldf-metadata.json'
    return self.dataset.write_metadata(dest / mdname)
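# Hypothetical call site for to_cldf above. `db` stands for an instance of the
# enclosing database class (not shown in this snippet); the directory name is
# illustrative.
#
#     md_path = db.to_cldf('cldf_out')
#     # writes cldf_out/sources.bib (if sources exist) and the data tables, and
#     # returns the path of the written metadata file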
def test_Sources_with_None_values(self):
    from pycldf.sources import Sources, Source

    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = self.tmp_path('test.bib')
    src.write(bib.name, bib.parent)
def test_Sources(self):
    from pycldf.sources import Sources, Source

    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    self.assertEqual(len(list(src.items())), 3)
    self.assertEqual(len(list(src.keys())), 3)
    refs = 'huber2005[1-6];Obrazy;Elegie[34]'
    self.assertEqual(src.format_refs(*list(src.expand_refs(refs))), refs)
    self.assertEqual('%s' % src['huber2005'], 'Huber, Herrmann. 2005. y.')
    with self.assertRaises(ValueError):
        src.add(5)
    with self.assertRaises(ValueError):
        src.add('@misc{a.b,\n author="a.b"\n}')
    bib = self.tmp_path('test.bib')
    src.write(bib.name, bib.parent)

    src2 = Sources()
    src2.read(bib.name, bib.parent)

    bib = self.tmp_path('test.bib')
    src2.write(bib.name, bib.parent, ids=['huber2005'])
    src = Sources()
    src.read(bib.name, bib.parent)
    self.assertEqual(len(src), 1)
def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4):
    """
    Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

    :param dest: Destination directory for the CLDF data.
    :param mdname: Filename of the metadata file to be written.
    :param coordinate_precision: Number of decimal places to which geo-coordinates \
    are rounded.
    :return: path of the metadata file
    """
    dest = pathlib.Path(dest)
    if not dest.exists():
        dest.mkdir()

    data = self.read()

    if data[self.source_table_name]:
        sources = Sources()
        for src in data[self.source_table_name]:
            sources.add(Source(
                src['genre'],
                src['id'],
                **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
        sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

    for table_type, items in data.items():
        try:
            table = self.dataset[table_type]
            items = [
                self.round_geocoordinates(item, precision=coordinate_precision)
                for item in items]
            table.common_props['dc:extent'] = table.write(
                [self.retranslate(table, item) for item in items], base=dest)
        except KeyError:
            assert table_type == self.source_table_name, table_type
    return self.dataset.write_metadata(dest / mdname)
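# Same call shape for the later variant above, which additionally rounds
# geo-coordinates; `db` again stands for an instance of the enclosing
# (not shown) class:
#
#     md_path = db.to_cldf('cldf_out', coordinate_precision=2)
#     # items pass through self.round_geocoordinates(..., precision=2)
#     # before being written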
def test_field_order(tmpdir):
    srcs = Sources()
    src = Source('misc', 'x')  # src is an OrderedDict and we add title *after* year.
    src['year'] = '2018'
    src['title'] = 'The Title'
    srcs.add(src)
    bib = tmpdir / 'test.bib'
    srcs.write(str(bib))
    res = bib.read_text(encoding='utf8')
    # Still, title should be printed in the BibTeX before year:
    assert res.index('title =') < res.index('year =')
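# For illustration: the BibTeX written above is expected to order fields
# canonically rather than by insertion order, i.e. roughly
#
#     @misc{x,
#         title = {The Title},
#         year = {2018}
#     }
#
# which is what the index comparison at the end of the test asserts.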
def test_Sources_roundtrip_latex(tmpdir, bibtex, expected):
    src = Sources()
    src.add(bibtex)
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
    assert expected in bib.read_text('utf8')
def test_Sources_with_None_values(tmpdir):
    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
class Dataset(object):
    """
    API to access a CLDF dataset.
    """
    def __init__(self, name):
        assert NAME_PATTERN.match(name)
        self.name = name
        self.sources = Sources()
        self.metadata = Metadata()
        self._rows = OrderedDict()

        # We store the fields (a.k.a. header) as tuple because it must be immutable after
        # first assignment (since changing is not well defined when there are already
        # rows).
        self._fields = ()
        self._source_count = None
        self._cited_sources = set()
        self._table = None

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.name)

    def __len__(self):
        """The length of a dataset is the number of rows in the values file."""
        return len(self.rows)

    def __getitem__(self, item):
        """
        Individual rows can be accessed by integer index or by row ID.

        :param item: `int` to access row by index, `str` to access by row ID
        :return: `OrderedDict`
        """
        if isinstance(item, int):
            return self.rows[item]
        return self._rows[item]

    @property
    def fields(self):
        """
        Read-only property to access the fields (a.k.a. header) defined for the dataset.

        :return: `tuple` of field names
        """
        return self._fields

    @property
    def table(self):
        return self._table

    @fields.setter
    def fields(self, value):
        """
        Fields can be assigned (but only once) for a dataset.

        :param value: `tuple` of field names.
        """
        if self._fields:
            raise ValueError('fields can only be assigned once!')
        assert isinstance(value, tuple)
        assert all(any(field in value for field in variants)
                   for variants in REQUIRED_FIELDS)
        table = self.metadata.get_table()
        if table:
            assert list(value) == list(table.schema.columns.keys())
        else:
            table = self.metadata.add_table(
                'values', '', [{'name': col, 'datatype': 'string'} for col in value])
            table.schema.primaryKey = 'ID'
        self._table = table
        self._fields = value

    @property
    def rows(self):
        return list(self._rows.values())

    @property
    def stats(self):
        return dict(
            languages=set(row['Language_ID'] for row in self.rows),
            parameters=set(row['Parameter_ID'] for row in self.rows),
            rowcount=(
                len(self),
                sum([1 for row in self.rows
                     if row['Language_ID'] and row['Parameter_ID']])),
            values=Counter(row['Value'] for row in self.rows),
        )

    def add_row(self, row):
        if not row:
            return
        d = ValuesRow.from_list(self, row)
        if d['ID'] in self._rows:
            raise ValueError('duplicate row ID: %s' % d['ID'])
        for ref in self.sources.expand_refs(d.get('Source', '')):
            self._cited_sources.add(ref.source.id)
        self._rows[d['ID']] = d
        return d

    @staticmethod
    def filename(fname, type_):
        """
        Compute the path for optional CLDF files relative to a given values file.

        :param fname: Path of the values file
        :param type_: Type of the optional file
        :return: name of the optional file
        """
        if type_ == 'sources':
            return fname.stem + '.bib'
        if type_ == 'metadata':
            return fname.stem + fname.suffix + MD_SUFFIX
        raise ValueError(type_)  # pragma: no cover

    @staticmethod
    def _existing_file(fname):
        fname = Path(fname)
        assert fname.exists() and fname.is_file()
        return fname

    @classmethod
    def _from(cls, data, container=None, skip_on_error=False):
        container = container or data.parent
        dataset = cls(data.stem)
        dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
        dataset._table = dataset.metadata.get_table()
        dataset.sources.read(Dataset.filename(data, 'sources'), container)
        delimiter = ','
        if dataset.table:
            delimiter = dataset.table.dialect.delimiter
        if data.suffix in TAB_SUFFIXES:
            delimiter = '\t'
        if isinstance(container, Archive):
            rows = container.read_text(data.name).split('\n')
        else:
            rows = data
        for i, row in enumerate(reader(rows, delimiter=delimiter)):
            if i == 0:
                dataset.fields = tuple(row)
            else:
                try:
                    dataset.add_row(row)
                except ValueError as e:
                    if skip_on_error:
                        log.warn('skipping row in line %s: %s' % (i + 1, e))
                    else:
                        raise e
        dataset.table.dialect.delimiter = delimiter
        dataset.table.url = data.name
        return dataset

    @classmethod
    def from_zip(cls, fname, name=None):
        archive = Archive(cls._existing_file(fname))
        return cls._from(
            Path(archive.metadata_name(prefix=name)[:-len(MD_SUFFIX)]), archive)

    @classmethod
    def from_metadata(cls, fname, container=None):
        fname = Path(fname)
        if not fname.name.endswith(MD_SUFFIX):
            raise ValueError('metadata file name must end with %s' % MD_SUFFIX)
        return cls._from(
            fname.parent.joinpath(fname.name[:-len(MD_SUFFIX)]), container=container)

    @classmethod
    def from_file(cls, fname, skip_on_error=False):
        """
        Factory method to create a `Dataset` from a CLDF values file.

        :param fname: Path of the CLDF values file.
        :return: `Dataset` instance.
        """
        return cls._from(cls._existing_file(fname), skip_on_error=skip_on_error)

    def write(self, outdir='.', suffix='.csv', cited_sources_only=False, archive=False):
        outdir = Path(outdir)
        if not outdir.exists():
            raise ValueError(outdir.as_posix())
        close = False
        if archive:
            if isinstance(archive, Archive):
                container = archive
            else:
                container = Archive(outdir.joinpath(self.name + '.zip'), mode='w')
                close = True
        else:
            container = outdir
        fname = Path(outdir).joinpath(self.name + suffix)
        if fname.suffix in TAB_SUFFIXES:
            self.table.dialect.delimiter = '\t'
        with UnicodeWriter(
                None if isinstance(container, Archive) else fname,
                delimiter=self.table.dialect.delimiter) as writer:
            writer.writerow(self.fields)
            for row in self.rows:
                writer.writerow(row.to_list())
        if isinstance(container, Archive):
            container.write_text(writer.read(), fname.name)
        self.table.url = fname.name
        self.metadata.write(Dataset.filename(fname, 'metadata'), container)
        ids = self._cited_sources if cited_sources_only else None
        self.sources.write(Dataset.filename(fname, 'sources'), container, ids=ids)
        if close:
            container.close()
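# Hedged usage sketch for the Dataset API above; the file name is illustrative
# and only methods defined in the class are used.
#
#     ds = Dataset.from_file('values.csv')  # also reads the sidecar sources and
#                                           # metadata files per Dataset.filename()
#     print(len(ds), ds.stats['rowcount'])
#     ds[0]      # row access by integer index ...
#     ds['1']    # ... or by row ID
#     ds.write(outdir='.', cited_sources_only=True)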