Пример #1
0
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
Пример #2
0
def main(args):  # pragma: no cover
    recreate_treeclosure()
Пример #3
0
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK

    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)
                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name),
                        l,
                        name=l.name,
                        description='Glottolog',
                        type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

                DBSession.flush()

        recreate_treeclosure()
Пример #4
0
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select
  l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from
  languagesource as ls,
  language as l,
  source as s,
  ref as r
where
  ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by
  l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        meds[lpk].append((spk, sid, sname, med_type, year, pages))  # The last one is the overall MED

    # Now weed out the "newer but worse" sources:
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # If year is more recent, this is a "newer but worse" item
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))):
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname) for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue

        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name
from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select
  l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk)
from
  language as l,
  treeclosuretable as t,
  parameter as p,
  valueset as vs,
  value as v
where
  l.pk = t.parent_pk and
  t.child_pk = vs.language_pk and
  vs.parameter_pk = p.pk and
  p.id = 'macroarea' and
  v.valueset_pk = vs.pk and
  l.pk not in (
    select language_pk 
    from valueset as _vs, parameter as _p 
    where _vs.parameter_pk = _p.pk and _p.id = 'macroarea'
  )
group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'macroarea')\
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join([macroareas[v.domainelement_pk][1] for v in vs.values])

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        raise ValueError(row)

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
Пример #5
0
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l,
  treeclosuretable as t,
  languoidmacroarea as lma,
  macroarea as ma
where
  l.pk = t.parent_pk and
  t.child_pk = lma.languoid_pk and
  lma.macroarea_pk = ma.pk and
  l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
Пример #6
0
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l,
  treeclosuretable as t,
  languoidmacroarea as lma,
  macroarea as ma
where
  l.pk = t.parent_pk and
  t.child_pk = lma.languoid_pk and
  lma.macroarea_pk = ma.pk and
  l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(source_pk=spk, valueset_pk=vspk))