def __init__(self, dataset):
    self._count = defaultdict(int)
    self._cognate_count = defaultdict(int)
    self.dataset = dataset

    md = self.dataset.cldf_dir / MD_NAME
    if not md.exists():
        md = self.dataset.cldf_dir / ALT_MD_NAME
        if not md.exists():
            # Neither metadata file exists yet: bootstrap from the packaged default.
            md = self.dataset.cldf_dir / MD_NAME
            copy(Path(__file__).parent / MD_NAME, md)
    self.wl = Wordlist.from_metadata(md)
    default_cldf = Wordlist.from_metadata(
        Path(__file__).parent / 'cldf-metadata.json')

    self.objects = {}
    self._obj_index = {}
    for cls in [
        self.dataset.lexeme_class,
        self.dataset.language_class,
        self.dataset.concept_class,
        self.dataset.cognate_class,
    ]:
        self.objects[cls.__cldf_table__()] = []
        self._obj_index[cls.__cldf_table__()] = set()
        cols = set(
            col.header for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
        properties = set(
            col.propertyUrl.uri
            for col in self.wl[cls.__cldf_table__()].tableSchema.columns
            if col.propertyUrl)
        for field in cls.fieldnames():
            try:
                col = default_cldf[cls.__cldf_table__(), field]
                #
                # We added Latitude and Longitude to the default metadata later, and want
                # to make sure existing datasets are upgraded silently.
                #
                if field in ['Latitude', 'Longitude'] \
                        and cls.__cldf_table__() == 'LanguageTable':
                    properties.add(col.propertyUrl.uri)
                    self.wl[cls.__cldf_table__(), field].propertyUrl = col.propertyUrl
                    self.wl[cls.__cldf_table__(), field].datatype = col.datatype
            except KeyError:
                # Field is not part of the default metadata: fall back to a plain
                # string column.
                col = Column(name=field, datatype="string")
            # Only append the column if it is not yet in the schema, matched either
            # by propertyUrl or by name.
            if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                    ((not col.propertyUrl) and (field not in cols)):
                self.wl[cls.__cldf_table__()].tableSchema.columns.append(col)
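# A minimal, self-contained sketch of the same schema-augmentation idea, using
# only the public pycldf/csvw API. The directory name 'cldf' and the 'Loan'
# column are illustrative assumptions, not part of the code above.
from csvw.metadata import Column
from pycldf import Wordlist

wl = Wordlist.in_dir('cldf')  # fresh wordlist with a default FormTable schema
existing = {col.header for col in wl['FormTable'].tableSchema.columns}
if 'Loan' not in existing:
    # Mirrors the Column(name=field, datatype="string") fallback used above.
    wl['FormTable'].tableSchema.columns.append(Column(name='Loan', datatype='string'))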
def test_modules(tmpdir):
    ds = Dataset(_make_tg(tmpdir))
    assert ds.primary_table is None
    ds = Dataset(_make_tg(tmpdir, {"url": "data.csv"}))
    assert ds.primary_table is None
    ds = Dataset(_make_tg(tmpdir, {
        "url": "data.csv",
        "dc:conformsTo": "http://cldf.clld.org/v1.0/terms.rdf#ValueTable"}))
    assert ds.primary_table == 'ValueTable'

    assert Wordlist.in_dir(str(tmpdir)).primary_table
    assert Dictionary.in_dir(str(tmpdir)).primary_table
    assert StructureDataset.in_dir(str(tmpdir)).primary_table
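# test_modules relies on a _make_tg helper that is not shown here. A plausible
# reconstruction (the exact original may differ): build a csvw TableGroup from
# zero or more table descriptions and anchor its metadata file below tmpdir.
import pathlib

from csvw.metadata import TableGroup

def _make_tg(tmpdir, *tables):
    tg = TableGroup.fromvalue({'tables': list(tables)})
    tg._fname = pathlib.Path(str(tmpdir)) / 'metadata.json'
    return tg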
def __enter__(self):
    super().__enter__()
    default_cldf = Wordlist.from_metadata(
        pathlib.Path(__file__).parent / MD_NAME)

    self._obj_index = {}
    for cls in [
        self.dataset.lexeme_class,
        self.dataset.language_class,
        self.dataset.concept_class,
        self.dataset.cognate_class,
    ]:
        self.objects[cls.__cldf_table__()] = []
        self._obj_index[cls.__cldf_table__()] = set()
        cols = set(
            col.header for col in self.cldf[cls.__cldf_table__()].tableSchema.columns)
        properties = set(
            col.propertyUrl.uri
            for col in self.cldf[cls.__cldf_table__()].tableSchema.columns
            if col.propertyUrl)
        for field in cls.fieldnames():
            try:
                col = default_cldf[cls.__cldf_table__(), field]
                #
                # We added Latitude and Longitude to the default metadata later, and want
                # to make sure existing datasets are upgraded silently.
                #
                if field in ['Latitude', 'Longitude'] \
                        and cls.__cldf_table__() == 'LanguageTable':  # pragma: no cover
                    properties.add(col.propertyUrl.uri)
                    self.cldf[cls.__cldf_table__(), field].propertyUrl = col.propertyUrl
                    self.cldf[cls.__cldf_table__(), field].datatype = col.datatype
            except KeyError:
                # Field is not part of the default metadata: fall back to a plain
                # string column.
                col = Column(name=field, datatype="string")
            # Only append the column if it is not yet in the schema, matched either
            # by propertyUrl or by name.
            if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                    ((not col.propertyUrl) and (field not in cols)):
                self.cldf[cls.__cldf_table__()].tableSchema.columns.append(col)
    return self
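# Hypothetical usage of the writer above as a context manager (the class name
# Writer and the row values are assumptions for illustration): __enter__
# upgrades the CLDF schema first, so rows appended to writer.objects are
# guaranteed to match the augmented tables.
with Writer(dataset) as writer:
    writer.objects['LanguageTable'].append(
        {'ID': 'stan1295', 'Name': 'German', 'Latitude': 48.649, 'Longitude': 12.4676})
    writer.objects['FormTable'].append(
        {'ID': '1', 'Language_ID': 'stan1295', 'Parameter_ID': 'one', 'Form': 'eins'})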
def main(args):
    Index(
        'ducet',
        collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', ''))
    ).create(DBSession.bind)

    data = Data()
    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )
    DBSession.add(dataset)

    for i, (id_, name) in enumerate([
        ("verkerkannemarie", "Annemarie Verkerk"),
        ("rzymskichristoph", "Christoph Rzymski"),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    # Take metadata from the curated CLDF dataset.
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )
    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )

    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # Get original forms.
    ds = Wordlist.from_metadata(data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution,
        d['id'],
        id=d['id'],
        name=d['name'],
    )

    # Process curated forms.
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add Base info if given.
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )
            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de,
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        valueset = data["ValueSet"].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            if form["Language_ID"] in data["Variety"]:
                vs = data.add(
                    common.ValueSet,
                    valueset_id,
                    id=valueset_id,
                    language=data["Variety"][form["Language_ID"]],
                    parameter=data["NumberParameter"][form["Parameter_ID"]],
                    contribution=contrib,
                )

                org_form = ""
                if form["ID"] in org_forms:
                    # Only record the original form if it differs from the curated one
                    # after NFC normalization.
                    if unicodedata.normalize(
                            'NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                        org_form = org_forms[form["ID"]]["Form"]
                else:
                    org_form = "no original form"

                DBSession.add(
                    models.NumberLexeme(
                        id=form["ID"],
                        name=form["Form"],
                        comment=form["Comment"],
                        is_loan=form["Loan"],
                        other_form=form["Other_Form"],
                        org_form=org_form,
                        is_problematic=form["Problematic"],
                        valueset=vs,
                    )
                )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip(
            [r[0] for r in distinct_varieties],
            color.qualitative_colors(len(distinct_varieties)),
        )
    )
    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))
    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
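# main follows the classic clld initializedb convention. Assuming the
# surrounding script also defines a prime_cache function (not shown here), the
# usual entry point would look like this:
from clld.scripts.util import initializedb

if __name__ == '__main__':
    initializedb(create=main, prime_cache=prime_cache)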
def ds_wl_notables(tmpdir):
    return Wordlist.in_dir(str(tmpdir), empty_tables=True)
def ds_wl(tmpdir):
    return Wordlist.in_dir(str(tmpdir))
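# If the two helpers above are meant as pytest fixtures, they can be registered
# as such; a sketch of a test consuming one of them (the registration and the
# test body are assumptions for illustration):
import pytest

ds_wl = pytest.fixture(ds_wl)
ds_wl_notables = pytest.fixture(ds_wl_notables)

def test_wordlist_primary_table(ds_wl):
    # pycldf Wordlists use FormTable as their primary table.
    assert ds_wl.primary_table == 'FormTable'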