def main(args):
    """Replace obsolete references that were matched to a successor.

    Reads the ``id -> replacement`` mapping produced by a prior matching
    step (``obsolete_refs_matched.json``); for each matched ref a redirect
    is registered via ``Config.add_replacement`` and the ref is deleted.

    :param args: CLI environment providing ``data_file``, ``version``.
    """
    #get_obsolete_refs(args)
    with transaction.manager:
        #match_obsolete_refs(args)
        #
        # TODO:
        # - create bibtex file containing all refs to be removed!
        #
        # NOTE(review): renamed the path variable -- the original reused
        # one name for both the path and the loaded mapping.
        matched_path = args.data_file(args.version, 'obsolete_refs_matched.json')
        if matched_path.exists():
            with open(matched_path) as fp:
                matched = json.load(fp)
        else:
            matched = {}

        for id_, repl in matched.items():
            if not repl:
                # unmatched obsolete refs are left alone here
                continue
            ref = Ref.get(id_, default=None)
            if ref is None:
                # already gone; nothing to redirect
                continue
            # register a redirect so old identifiers keep resolving, then
            # drop the obsolete ref itself.
            Config.add_replacement(ref, repl, session=DBSession, model=Source)
            DBSession.delete(ref)
def main(args):
    """Apply the 'monster' source replacements.

    Each ``(id, replacement)`` pair from ``monster-replacements.json`` is
    turned into a redirect via ``Config.add_replacement``; the obsolete
    source is then deleted.
    """
    replacement_pairs = {
        (item['id'], item['replacement'])
        for item in jsonload(
            args.data_dir.joinpath('scripts', 'monster-replacements.json'))}

    with transaction.manager:
        for old_id, new_id in replacement_pairs:
            source = Source.get('%s' % old_id, default=None)
            if not source:
                # nothing in the DB under this id -- skip silently
                continue
            Config.add_replacement(
                source, '%s' % new_id, session=DBSession, model=Source)
            # FIXME: "redirect" relations, e.g. from valuesetreference as well!
            DBSession.delete(source)

    args.log.info('%s replacements' % len(replacement_pairs))
def main(args):
    """Remove obsolete refs of the ``glottolog20121`` provider, then the provider.

    Refs listed in ``obsolete_refs.json`` are tombstoned (replacement
    ``None``) and deleted; any surviving ref must also be claimed by at
    least one other provider.

    :param args: CLI environment providing ``data_file`` and DB access.
    """
    with open(args.data_file('2.3', 'obsolete_refs.json')) as fp:
        # a set gives O(1) membership tests in the loop below (the JSON
        # payload is a list, which would be O(n) per lookup).
        obsolete = set(json.load(fp))

    with transaction.manager:
        provider = Provider.get('glottolog20121')
        for ref in provider.refs:
            if ref.id in obsolete:
                # no successor exists: register a tombstone redirect, then delete.
                Config.add_replacement(ref, None, session=DBSession, model=Source)
                DBSession.delete(ref)
            else:
                # sanity check: a kept ref must belong to another provider too.
                assert len(ref.providers) > 1
        # flush ref deletions before removing the provider they pointed at.
        DBSession.flush()
        DBSession.delete(provider)
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    # (review: removed a dead `if 1:` wrapper around this section.)
    langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
    features = {f.pk: f for f in DBSession.query(models.Feature)}

    # denormalize: number of coded valuesets per language ...
    for lpk, nf in DBSession.query(
            common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
            .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
            .group_by(common.ValueSet.language_pk):
        langs[lpk].representation = nf

    # ... and number of coded valuesets per feature.
    for fpk, nl in DBSession.query(
            common.ValueSet.parameter_pk, func.count(common.ValueSet.pk)) \
            .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
            .group_by(common.ValueSet.parameter_pk):
        features[fpk].representation = nl

    compute_language_sources()

    get_repos()
    # Rebuild phylogenies from scratch: delete dependents first so foreign
    # key constraints are respected (labels -> tree labels -> phylogenies).
    for model in (LanguageTreeLabel, TreeLabel, Phylogeny):
        for obj in DBSession.query(model).all():
            DBSession.delete(obj)
    DBSession.flush()

    for tree in tqdm(
            iter_trees(
                [l.id for l in DBSession.query(common.Language)],
                Glottolog(REPOS['glottolog']))):
        nodes = set(n.name for n in tree.traverse())
        phylo = Phylogeny(
            id=tree.name.split('_')[1],
            name=tree.name,
            # format=9: leaf names only -- TODO confirm against ete3 docs
            newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(id=l.id, name=l.id, phylogeny=phylo))
        DBSession.add(phylo)
def justifications(args, languages):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    def normalized_pages(s):
        # Returns s (possibly "") only when it matches PAGES_PATTERN;
        # otherwise falls through and implicitly returns None.
        if PAGES_PATTERN.match(s or ""):
            return s or ""

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    # `languages` is used as the hid index and mutated in place -- callers
    # see the additions. TODO confirm this is intended.
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # Inactive languoids first, then active ones, so that an active
    # languoid overwrites an obsolete one under the same key.
    for l in DBSession.query(Languoid).filter(Languoid.active == False):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l
    for l in DBSession.query(Languoid).filter(Languoid.active == True):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for id_, type_ in [("fc", "family"), ("sc", "subclassification")]:
        for i, row in enumerate(dsv.reader(args.data_file("%s_justifications.tab" % type_))):
            # Underscores encode spaces in names, except for NOCODE_* ids.
            name = row[0]
            name = name.replace("_", " ") if not name.startswith("NOCODE") else name
            # Lookup priority: hname, then hid, then plain name.
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn("ignoring %s" % name)
                continue

            # Comment column differs by file layout: col 3 for family,
            # col 2 for subclassification justifications.
            _r = 3 if type_ == "family" else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                # no real words -> treat as empty
                comment = None

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #
            refs = [(int(m.group("id")), normalized_pages(m.group("comment")))
                    for m in REF_PATTERN.finditer(row[2])]

            # Find an existing valueset for this parameter, if any.
            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info("%s %s ++" % (l.id, type_))
                vs = ValueSet(
                    id="%s%s" % (type_, l.id),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first(),
                )
                DBSession.add(Value(
                    id="%s%s" % (type_, l.id),
                    name="%s - %s" % (l.level, l.status),
                    valueset=vs))
                # flush so the new valueset gets a pk before references are attached.
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info("%s %s ~~ description" % (l.id, type_))
                    vs.description = comment

            # Replace all existing references with the ones parsed above.
            for r in vs.references:
                DBSession.delete(r)
            for r, pages in refs:
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)), description=pages))
        # logs the last row index seen for this justification type
        args.log.info("%s %s" % (i, type_))
def justifications(args, languages, stats):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    # Map hh bibkeys to glottolog ref ids so **bibkey** markers in the
    # justification files can be rewritten to **glottolog_ref_id**.
    hh_bibkey_to_glottolog_id = {}
    for rec in get_bib(args):
        for provider, bibkeys in get_bibkeys(rec).items():
            if provider == 'hh':
                for bibkey in bibkeys:
                    hh_bibkey_to_glottolog_id[bibkey] = rec['glottolog_ref_id']
                break

    def substitute_hh_bibkeys(m):
        # raises KeyError for unknown bibkeys -- fail loud on bad data
        return '**%s**' % hh_bibkey_to_glottolog_id[m.group('bibkey')]

    # Compiled once and reused; the original inlined this pattern twice per
    # row as a NON-raw string ('\*' is an invalid escape sequence).
    bibkey_pattern = re.compile(r'\*\*(?P<bibkey>[^\*]+)\*\*')

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # order by active to make sure active languoids overwrite the data of obsolete ones.
    for l in DBSession.query(Languoid).order_by(Languoid.active):
        langs_by_hname[l.jsondata.get('hname')] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    def normalize_pages(s):
        # strip surrounding whitespace and a trailing comma; empty -> None
        return (s or '').strip().rstrip(',') or None

    for id_, type_ in [('fc', 'family'), ('sc', 'subclassification')]:
        for i, row in enumerate(dsv.reader(
                args.data_dir.joinpath(
                    'languoids', 'forkel_%s_justifications-utf8.tab' % type_))):
            # Underscores encode spaces in names, except for NOCODE_* ids.
            name = row[0]
            name = name.replace('_', ' ') if not name.startswith('NOCODE') else name
            # Lookup priority: hname, then hid, then plain name.
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn('ignoring %s' % name)
                continue

            # Comment column differs by file layout: col 3 for family,
            # col 2 for subclassification justifications.
            _r = 3 if type_ == 'family' else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                # no real words -> treat as empty
                comment = None
            if comment:
                comment = bibkey_pattern.sub(substitute_hh_bibkeys, comment)

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #
            refs = [(int(m.group('id')), normalize_pages(m.group('pages')))
                    for m in REF_PATTERN.finditer(
                        bibkey_pattern.sub(substitute_hh_bibkeys, row[2]))]

            # Find an existing valueset for this parameter, if any.
            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info('%s %s ++' % (l.id, type_))
                vs = ValueSet(
                    id='%s%s' % (id_, l.pk),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first())
                DBSession.add(Value(
                    id='%s%s' % (id_, l.pk),
                    name='%s - %s' % (l.level, l.status),
                    valueset=vs))
                # flush so the new valueset gets a pk before references are attached.
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info('%s %s ~~ description: %s ---> %s' % (
                        l.id, type_, vs.description, comment))
                    vs.description = comment
                    stats.update(['justifications-%s' % type_])

            # Replace all existing references with the ones parsed above.
            for r in vs.references:
                DBSession.delete(r)
            for r, pages in refs:
                # FIXME: we must make sure not to link sources which will subsequently be
                # replaced!
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)),
                    description=pages))
        # logs the last row index seen for this justification type
        args.log.info('%s %s' % (i, type_))