Example #1
def main(args):
    #get_obsolete_refs(args)
    with transaction.manager:
        #match_obsolete_refs(args)

        #
        # TODO:
        # - create bibtex file containing all refs to be removed!
        # -
        #
        fname = args.data_file(args.version, 'obsolete_refs_matched.json')
        if fname.exists():
            with open(fname) as fp:
                matched = json.load(fp)
        else:
            matched = {}

        for id_, repl in matched.items():
            if not repl:
                continue
            ref = Ref.get(id_, default=None)
            if ref is None:
                continue
            Config.add_replacement(ref, repl, session=DBSession, model=Source)
            DBSession.delete(ref)
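
For reference, the loop above treats obsolete_refs_matched.json as a mapping from obsolete ref ids to replacement source ids, skipping entries with a falsy value. An illustrative, made-up shape:

# Illustrative only: keys are obsolete Ref ids, values are the Source ids that replace them.
matched_example = {
    "ref-1": "src-42",   # replacement registered, then the ref is deleted
    "ref-2": None,       # no replacement known, skipped by the loop above
}
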
def main(args):
    repls = {(i['id'], i['replacement']) for i in
             jsonload(args.data_dir.joinpath('scripts', 'monster-replacements.json'))}

    with transaction.manager:
        for ref_id, repl_id in repls:
            ref = Source.get('%s' % ref_id, default=None)
            if ref:
                Config.add_replacement(
                    ref, '%s' % repl_id, session=DBSession, model=Source)
                # FIXME: "redirect" relations, e.g. from valuesetreference as well!
                DBSession.delete(ref)
    args.log.info('%s replacements' % len(repls))
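
monster-replacements.json, by contrast, is read as a list of records with 'id' and 'replacement' keys, collapsed into a set of pairs. A made-up example of the expected input and the resulting set:

# Illustrative only: the script expects a list of {'id': ..., 'replacement': ...} records.
replacements_example = [
    {"id": 123, "replacement": 456},
    {"id": 124, "replacement": 456},  # several obsolete refs may share one replacement
]
repls = {(i["id"], i["replacement"]) for i in replacements_example}
assert repls == {(123, 456), (124, 456)}
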
def main(args):
    with open(args.data_file('2.3', 'obsolete_refs.json')) as fp:
        obsolete = json.load(fp)

    with transaction.manager:
        provider = Provider.get('glottolog20121')
        for ref in provider.refs:
            if ref.id in obsolete:
                Config.add_replacement(ref, None, session=DBSession, model=Source)
                DBSession.delete(ref)
            else:
                assert len(ref.providers) > 1

        DBSession.flush()
        DBSession.delete(provider)
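
All three variants follow the same pattern: register a replacement for the resource that is about to disappear, then delete it. What Config.add_replacement actually persists is not shown here; the sketch below only illustrates the pattern with a plain dict, where a replacement of None stands for a ref removed without a redirect target.

# Minimal sketch of the replace-then-delete pattern, with a plain dict as a
# stand-in for whatever Config.add_replacement persists.
replacements = {}

def add_replacement(obj_id, repl_id):
    # repl_id is None for refs that are removed without a replacement (third variant above)
    replacements[obj_id] = repl_id

def resolve(obj_id):
    """Return the id an old link should point to, or None if the resource is simply gone."""
    return replacements.get(obj_id)

add_replacement("ref-1", "src-42")   # replaced
add_replacement("ref-2", None)       # removed without replacement
assert resolve("ref-1") == "src-42"
assert resolve("ref-2") is None
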
Example #4
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
    features = {f.pk: f for f in DBSession.query(models.Feature)}

    # cache the number of coded values per language as its "representation" ...
    for lpk, nf in DBSession.query(common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
            .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
            .group_by(common.ValueSet.language_pk):
        langs[lpk].representation = nf

    # ... and likewise the number of coded values per feature.
    for fpk, nl in DBSession.query(common.ValueSet.parameter_pk, func.count(common.ValueSet.pk))\
            .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk)\
            .group_by(common.ValueSet.parameter_pk):
        features[fpk].representation = nl

    compute_language_sources()

    get_repos()

    # drop previously imported phylogenies (and their tree labels) before re-adding them
    for obj in DBSession.query(LanguageTreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(TreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(Phylogeny).all():
        DBSession.delete(obj)
    DBSession.flush()

    for tree in tqdm(
            iter_trees([l.id for l in DBSession.query(common.Language)],
                       Glottolog(REPOS['glottolog']))):
        nodes = {n.name for n in tree.traverse()}
        phylo = Phylogeny(id=tree.name.split('_')[1],
                          name=tree.name,
                          newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(language=l,
                              treelabel=TreeLabel(id=l.id,
                                                  name=l.id,
                                                  phylogeny=phylo))
        DBSession.add(phylo)
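
The two grouped queries above amount to counting coded values per language and per feature and caching the result as "representation". A self-contained sketch of the same aggregation over plain (language_pk, parameter_pk) pairs:

from collections import Counter

# Each pair stands for one Value row: (language_pk, parameter_pk) of its ValueSet.
values = [(1, 10), (1, 11), (2, 10)]

per_language = Counter(lpk for lpk, _ in values)   # ends up in langs[lpk].representation
per_feature = Counter(fpk for _, fpk in values)    # ends up in features[fpk].representation

assert per_language == {1: 2, 2: 1}
assert per_feature == {10: 2, 11: 1}
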
Example #5
def justifications(args, languages):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """

    def normalized_pages(s):
        # keep the pages string only if it matches the expected pattern; otherwise return None
        if PAGES_PATTERN.match(s or ""):
            return s or ""

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # inactive languoids first, so that active ones overwrite them in the lookups below
    for l in DBSession.query(Languoid).filter(Languoid.active == False):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for l in DBSession.query(Languoid).filter(Languoid.active == True):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for id_, type_ in [("fc", "family"), ("sc", "subclassification")]:
        for i, row in enumerate(dsv.reader(args.data_file("%s_justifications.tab" % type_))):
            name = row[0]
            name = name.replace("_", " ") if not name.startswith("NOCODE") else name
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warning("ignoring %s" % name)
                continue

            _r = 3 if type_ == "family" else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                comment = None

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            refs = [(int(m.group("id")), normalized_pages(m.group("comment"))) for m in REF_PATTERN.finditer(row[2])]

            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info("%s %s ++" % (l.id, type_))
                vs = ValueSet(
                    id="%s%s" % (type_, l.id),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first(),
                )
                DBSession.add(Value(
                    id="%s%s" % (type_, l.id),
                    name="%s - %s" % (l.level, l.status),
                    valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info("%s %s ~~ description" % (l.id, type_))
                    vs.description = comment

            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                vs.references.append(ValueSetReference(source=Source.get(str(r)), description=pages))

        args.log.info("%s %s" % (i, type_))
Example #6
def justifications(args, languages, stats):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    hh_bibkey_to_glottolog_id = {}
    for rec in get_bib(args):
        for provider, bibkeys in get_bibkeys(rec).items():
            if provider == 'hh':
                for bibkey in bibkeys:
                    hh_bibkey_to_glottolog_id[bibkey] = rec['glottolog_ref_id']
                break

    def substitute_hh_bibkeys(m):
        return '**%s**' % hh_bibkey_to_glottolog_id[m.group('bibkey')]

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # order by active so that active languoids overwrite the data of obsolete ones.
    for l in DBSession.query(Languoid).order_by(Languoid.active):
        langs_by_hname[l.jsondata.get('hname')] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    def normalize_pages(s):
        return (s or '').strip().rstrip(',') or None

    for id_, type_ in [('fc', 'family'), ('sc', 'subclassification')]:
        for i, row in enumerate(dsv.reader(
                args.data_dir.joinpath('languoids', 'forkel_%s_justifications-utf8.tab' % type_))):
            name = row[0]
            name = name.replace('_', ' ') if not name.startswith('NOCODE') else name
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warning('ignoring %s' % name)
                continue

            _r = 3 if type_ == 'family' else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                comment = None
            if comment:
                comment = re.sub(r'\*\*(?P<bibkey>[^*]+)\*\*', substitute_hh_bibkeys, comment)

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            refs = [(int(m.group('id')), normalize_pages(m.group('pages')))
                    for m in REF_PATTERN.finditer(
                        re.sub(r'\*\*(?P<bibkey>[^*]+)\*\*', substitute_hh_bibkeys, row[2]))]

            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info('%s %s ++' % (l.id, type_))
                vs = ValueSet(
                    id='%s%s' % (id_, l.pk),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first())
                DBSession.add(Value(
                    id='%s%s' % (id_, l.pk),
                    name='%s - %s' % (l.level, l.status),
                    valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info('%s %s ~~ description: %s ---> %s' % (l.id, type_, vs.description, comment))
                    vs.description = comment
                    stats.update(['justifications-%s' % type_])

            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                # FIXME: we must make sure not to link sources which will subsequently be
                # replaced!
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)),
                    description=pages))

        args.log.info('%s %s' % (i, type_))
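
The bibkey rewriting above is a plain re.sub with a named group and a replacement callable: every **hh-bibkey** marker is swapped for the matching Glottolog ref id before REF_PATTERN is applied. A self-contained toy version with a made-up bibkey and mapping:

import re

# Made-up mapping and text; the real mapping is built from get_bib(args) above.
hh_bibkey_to_glottolog_id = {'s:Adelaar:Tucanoan': '123456'}

def substitute_hh_bibkeys(m):
    return '**%s**' % hh_bibkey_to_glottolog_id[m.group('bibkey')]

text = 'See **s:Adelaar:Tucanoan**: 10-12.'
out = re.sub(r'\*\*(?P<bibkey>[^*]+)\*\*', substitute_hh_bibkeys, text)
assert out == 'See **123456**: 10-12.'
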